# Container image deployed when the user does not specify one on the CLI.
DEFAULT_IMAGE='docker.io/ceph/ceph:v15'
# Root of the per-cluster daemon data directories.
DATA_DIR='/var/lib/ceph'
# Root of the per-cluster log directories.
LOG_DIR='/var/log/ceph'
# Directory holding cephadm's own lock files (see FileLock below).
LOCK_DIR='/run/cephadm'
# Where per-cluster logrotate configuration files are installed.
LOGROTATE_DIR='/etc/logrotate.d'
# Where generated systemd unit files are installed.
UNIT_DIR='/etc/systemd/system'
# Container engines probed for, in order of preference.
CONTAINER_PREFERENCE = ['podman', 'docker'] # prefer podman to docker
# Shell prompt shown inside the interactive `cephadm shell` container.
CUSTOM_PS1=r'[ceph: \u@\h \W]\$ '
# Default subprocess timeout; None means wait indefinitely.
DEFAULT_TIMEOUT=None # in seconds
# Default config/keyring paths used by `cephadm shell`.
SHELL_DEFAULT_CONF='/etc/ceph/ceph.conf'
SHELL_DEFAULT_KEYRING='/etc/ceph/ceph.client.admin.keyring'
19 You can invoke cephadm in two ways:
21 1. The normal way, at the command line.
23 2. By piping the script to the python3 binary. In this latter case, you should
24 prepend one or more lines to the beginning of the script.
32 injected_argv = ['ls']
34 For reading stdin from the '--config-json -' argument,
36 injected_stdin = '...'
58 from typing
import Dict
, List
, Tuple
, Optional
, Union
, Any
, NoReturn
, Callable
63 from functools
import wraps
65 from threading
import Thread
67 if sys
.version_info
>= (3, 0):
68 from io
import StringIO
70 from StringIO
import StringIO
72 if sys
.version_info
>= (3, 2):
73 from configparser
import ConfigParser
75 from ConfigParser
import SafeConfigParser
77 if sys
.version_info
>= (3, 0):
78 from urllib
.request
import urlopen
79 from urllib
.error
import HTTPError
81 from urllib2
import urlopen
, HTTPError
# Timestamp format used when rendering/parsing event times (ISO-8601-like, UTC).
DATEFMT = '%Y-%m-%dT%H:%M:%S.%f'
88 class Error(Exception):
91 class TimeoutExpired(Error
):
94 ##################################
97 daemons
= ('mon', 'mgr', 'mds', 'osd', 'rgw', 'rbd-mirror',
100 ##################################
102 class Monitoring(object):
103 """Define the configs for the monitoring containers"""
106 "prometheus": [9095], # Avoid default 9090, due to conflict with cockpit UI
107 "node-exporter": [9100],
109 "alertmanager": [9093, 9094],
114 "image": "prom/prometheus:latest",
118 "--config.file=/etc/prometheus/prometheus.yml",
119 "--storage.tsdb.path=/prometheus",
120 "--web.listen-address=:{}".format(port_map
['prometheus'][0]),
122 "config-json-files": [
127 "image": "prom/node-exporter",
131 "--no-collector.timex",
135 "image": "ceph/ceph-grafana:latest",
139 "config-json-files": [
141 "provisioning/datasources/ceph-dashboard.yml",
147 "image": "prom/alertmanager",
151 "config-json-files": [
154 "config-json-args": [
160 ##################################
162 class NFSGanesha(object):
163 """Defines a NFS-Ganesha container"""
166 entrypoint
= '/usr/bin/ganesha.nfsd'
167 daemon_args
= ['-F', '-L', 'STDERR']
169 required_files
= ['ganesha.conf']
179 image
=DEFAULT_IMAGE
):
180 # type: (str, Union[int, str], Dict, str) -> None
182 self
.daemon_id
= daemon_id
185 def json_get(key
, default
=None, require
=False):
186 if require
and not key
in config_json
.keys():
187 raise Error('{} missing from config-json'.format(key
))
188 return config_json
.get(key
, default
)
190 # config-json options
191 self
.pool
= json_get('pool', require
=True)
192 self
.namespace
= json_get('namespace')
193 self
.files
= json_get('files', {})
195 # validate the supplied args
199 def init(cls
, fsid
, daemon_id
):
200 # type: (str, Union[int, str]) -> NFSGanesha
201 return cls(fsid
, daemon_id
, get_parm(args
.config_json
), args
.image
)
206 for (srv
, port
) in NFSGanesha
.port_map
.items():
207 if port_in_use(port
):
208 msg
= 'TCP port {} required for {} is already in use'.format(port
, srv
)
212 def get_container_mounts(data_dir
):
213 # type: (str) -> Dict[str, str]
215 mounts
[os
.path
.join(data_dir
, 'config')] = '/etc/ceph/ceph.conf:z'
216 mounts
[os
.path
.join(data_dir
, 'keyring')] = '/etc/ceph/keyring:z'
217 mounts
[os
.path
.join(data_dir
, 'etc/ganesha')] = '/etc/ganesha:z'
221 def get_container_envs():
222 # type: () -> List[str]
224 'CEPH_CONF=%s' % ('/etc/ceph/ceph.conf')
229 def get_version(container_id
):
230 # type(str) -> Optional[str]
232 out
, err
, code
= call(
233 [container_path
, 'exec', container_id
,
234 NFSGanesha
.entrypoint
, '-v'])
236 match
= re
.search(r
'NFS-Ganesha Release\s*=\s*[V]*([\d.]+)', out
)
238 version
= match
.group(1)
243 if not is_fsid(self
.fsid
):
244 raise Error('not an fsid: %s' % self
.fsid
)
245 if not self
.daemon_id
:
246 raise Error('invalid daemon_id: %s' % self
.daemon_id
)
248 raise Error('invalid image: %s' % self
.image
)
250 # check for the required files
251 if self
.required_files
:
252 for fname
in self
.required_files
:
253 if fname
not in self
.files
:
254 raise Error('required file missing from config-json: %s' % fname
)
256 def get_daemon_name(self
):
258 return '%s.%s' % (self
.daemon_type
, self
.daemon_id
)
260 def get_container_name(self
, desc
=None):
261 # type: (Optional[str]) -> str
262 cname
= 'ceph-%s-%s' % (self
.fsid
, self
.get_daemon_name())
264 cname
= '%s-%s' % (cname
, desc
)
267 def get_file_content(self
, fname
):
269 """Normalize the json file content into a string"""
270 content
= self
.files
.get(fname
)
271 if isinstance(content
, list):
272 content
= '\n'.join(content
)
275 def create_daemon_dirs(self
, data_dir
, uid
, gid
):
276 # type: (str, int, int) -> None
277 """Create files under the container data dir"""
278 if not os
.path
.isdir(data_dir
):
279 raise OSError('data_dir is not a directory: %s' % (data_dir
))
281 logger
.info('Creating ganesha config...')
283 # create the ganesha conf dir
284 config_dir
= os
.path
.join(data_dir
, 'etc/ganesha')
285 makedirs(config_dir
, uid
, gid
, 0o755)
287 # populate files from the config-json
288 for fname
in self
.files
:
289 config_file
= os
.path
.join(config_dir
, fname
)
290 config_content
= self
.get_file_content(fname
)
291 logger
.info('Write file: %s' % (config_file
))
292 with
open(config_file
, 'w') as f
:
293 os
.fchown(f
.fileno(), uid
, gid
)
294 os
.fchmod(f
.fileno(), 0o600)
295 f
.write(config_content
)
297 def get_rados_grace_container(self
, action
):
298 # type: (str) -> CephContainer
299 """Container for a ganesha action on the grace db"""
300 entrypoint
= '/usr/bin/ganesha-rados-grace'
303 args
=['--pool', self
.pool
]
305 args
+= ['--ns', self
.namespace
]
306 args
+= [action
, self
.get_daemon_name()]
308 data_dir
= get_data_dir(self
.fsid
, self
.daemon_type
, self
.daemon_id
)
309 volume_mounts
= self
.get_container_mounts(data_dir
)
310 envs
= self
.get_container_envs()
312 logger
.info('Creating RADOS grace for action: %s' % (action
))
315 entrypoint
=entrypoint
,
317 volume_mounts
=volume_mounts
,
318 cname
=self
.get_container_name(desc
='grace-%s' % (action
)),
323 ##################################
def get_supported_daemons():
    # type: () -> List[str]
    """Return every daemon type cephadm can deploy: ceph daemons, the
    monitoring stack components, and NFS-Ganesha."""
    known = list(Ceph.daemons)
    known.extend(Monitoring.components)
    known.append(NFSGanesha.daemon_type)
    # sanity check: the daemon-type namespace must have no duplicates
    assert len(known) == len(set(known))
    return known
332 ##################################
334 def attempt_bind(s
, address
, port
):
337 s
.setsockopt(socket
.SOL_SOCKET
, socket
.SO_REUSEADDR
, 1)
338 s
.bind((address
, port
))
339 except (socket
.error
, OSError) as e
: # py2 and py3
340 msg
= 'Cannot bind to IP %s port %d: %s' % (address
, port
, e
)
342 if e
.errno
== errno
.EADDRINUSE
:
344 elif e
.errno
== errno
.EADDRNOTAVAIL
:
349 def port_in_use(port_num
):
351 """Detect whether a port is in use on the local machine - IPv4 and IPv6"""
352 logger
.info('Verifying port %d ...' % (port_num
))
354 s
= socket
.socket(socket
.AF_INET
, socket
.SOCK_STREAM
)
355 attempt_bind(s
, '0.0.0.0', port_num
)
357 s
= socket
.socket(socket
.AF_INET6
, socket
.SOCK_STREAM
)
358 attempt_bind(s
, '::', port_num
)
364 def check_ip_port(ip
, port
):
365 if not args
.skip_ping_check
:
366 logger
.info('Verifying IP %s port %d ...' % (ip
, port
))
367 if ip
.startswith('[') or '::' in ip
:
368 s
= socket
.socket(socket
.AF_INET6
, socket
.SOCK_STREAM
)
370 s
= socket
.socket(socket
.AF_INET
, socket
.SOCK_STREAM
)
372 attempt_bind(s
, ip
, port
)
376 ##################################
378 # this is an abbreviated version of
379 # https://github.com/benediktschmitt/py-filelock/blob/master/filelock.py
380 # that drops all of the compatibility (this is Unix/Linux only).
385 TimeoutError
= OSError
387 class Timeout(TimeoutError
):
389 Raised when the lock could not be acquired in *timeout*
393 def __init__(self
, lock_file
):
396 #: The path of the file lock.
397 self
.lock_file
= lock_file
401 temp
= "The file lock '{}' could not be acquired."\
402 .format(self
.lock_file
)
406 class _Acquire_ReturnProxy(object):
407 def __init__(self
, lock
):
414 def __exit__(self
, exc_type
, exc_value
, traceback
):
419 class FileLock(object):
420 def __init__(self
, name
, timeout
= -1):
421 if not os
.path
.exists(LOCK_DIR
):
422 os
.mkdir(LOCK_DIR
, 0o700)
423 self
._lock
_file
= os
.path
.join(LOCK_DIR
, name
+ '.lock')
425 # The file descriptor for the *_lock_file* as it is returned by the
426 # os.open() function.
427 # This file lock is only NOT None, if the object currently holds the
429 self
._lock
_file
_fd
= None
430 self
.timeout
= timeout
431 # The lock counter is used for implementing the nested locking
432 # mechanism. Whenever the lock is acquired, the counter is increased and
433 # the lock is only released, when this value is 0 again.
434 self
._lock
_counter
= 0
439 return self
._lock
_file
_fd
is not None
441 def acquire(self
, timeout
=None, poll_intervall
=0.05):
443 Acquires the file lock or fails with a :exc:`Timeout` error.
444 .. code-block:: python
445 # You can use this method in the context manager (recommended)
448 # Or use an equivalent try-finally construct:
455 The maximum time waited for the file lock.
456 If ``timeout < 0``, there is no timeout and this method will
457 block until the lock could be acquired.
458 If ``timeout`` is None, the default :attr:`~timeout` is used.
459 :arg float poll_intervall:
460 We check once in *poll_intervall* seconds if we can acquire the
463 if the lock could not be acquired in *timeout* seconds.
464 .. versionchanged:: 2.0.0
465 This method returns now a *proxy* object instead of *self*,
466 so that it can be used in a with statement without side effects.
468 # Use the default timeout, if no timeout is provided.
470 timeout
= self
.timeout
472 # Increment the number right at the beginning.
473 # We can still undo it, if something fails.
474 self
._lock
_counter
+= 1
477 lock_filename
= self
._lock
_file
478 start_time
= time
.time()
481 if not self
.is_locked
:
482 logger
.debug('Acquiring lock %s on %s', lock_id
,
487 logger
.debug('Lock %s acquired on %s', lock_id
,
490 elif timeout
>= 0 and time
.time() - start_time
> timeout
:
491 logger
.warning('Timeout acquiring lock %s on %s', lock_id
,
493 raise Timeout(self
._lock
_file
)
496 'Lock %s not acquired on %s, waiting %s seconds ...',
497 lock_id
, lock_filename
, poll_intervall
499 time
.sleep(poll_intervall
)
501 # Something did go wrong, so decrement the counter.
502 self
._lock
_counter
= max(0, self
._lock
_counter
- 1)
505 return _Acquire_ReturnProxy(lock
= self
)
507 def release(self
, force
= False):
509 Releases the file lock.
510 Please note, that the lock is only completly released, if the lock
512 Also note, that the lock file itself is not automatically deleted.
514 If true, the lock counter is ignored and the lock is released in
518 self
._lock
_counter
-= 1
520 if self
._lock
_counter
== 0 or force
:
522 lock_filename
= self
._lock
_file
524 logger
.debug('Releasing lock %s on %s', lock_id
, lock_filename
)
526 self
._lock
_counter
= 0
527 logger
.debug('Lock %s released on %s', lock_id
, lock_filename
)
535 def __exit__(self
, exc_type
, exc_value
, traceback
):
540 self
.release(force
= True)
545 open_mode
= os
.O_RDWR | os
.O_CREAT | os
.O_TRUNC
546 fd
= os
.open(self
._lock
_file
, open_mode
)
549 fcntl
.flock(fd
, fcntl
.LOCK_EX | fcntl
.LOCK_NB
)
550 except (IOError, OSError):
553 self
._lock
_file
_fd
= fd
557 # Do not remove the lockfile:
559 # https://github.com/benediktschmitt/py-filelock/issues/31
560 # https://stackoverflow.com/questions/17708885/flock-removing-locked-file-without-race-condition
561 fd
= self
._lock
_file
_fd
562 self
._lock
_file
_fd
= None
563 fcntl
.flock(fd
, fcntl
.LOCK_UN
)
568 ##################################
569 # Popen wrappers, lifted from ceph-volume
571 def call(command
, # type: List[str]
572 desc
=None, # type: Optional[str]
573 verbose
=False, # type: bool
574 verbose_on_failure
=True, # type: bool
575 timeout
=DEFAULT_TIMEOUT
, # type: Optional[int]
578 Wrap subprocess.Popen to
580 - log stdout/stderr to a logger,
582 - cleanly return out, err, returncode
584 If verbose=True, log at info (instead of debug) level.
586 :param verbose_on_failure: On a non-zero exit status, it will forcefully set
587 logging ON for the terminal
588 :param timeout: timeout in seconds
592 timeout
= timeout
or args
.timeout
594 logger
.debug("Running command: %s" % ' '.join(command
))
595 process
= subprocess
.Popen(
597 stdout
=subprocess
.PIPE
,
598 stderr
=subprocess
.PIPE
,
602 # get current p.stdout flags, add O_NONBLOCK
603 assert process
.stdout
is not None
604 assert process
.stderr
is not None
605 stdout_flags
= fcntl
.fcntl(process
.stdout
, fcntl
.F_GETFL
)
606 stderr_flags
= fcntl
.fcntl(process
.stderr
, fcntl
.F_GETFL
)
607 fcntl
.fcntl(process
.stdout
, fcntl
.F_SETFL
, stdout_flags | os
.O_NONBLOCK
)
608 fcntl
.fcntl(process
.stderr
, fcntl
.F_SETFL
, stderr_flags | os
.O_NONBLOCK
)
614 out_buffer
= '' # partial line (no newline yet)
615 err_buffer
= '' # partial line (no newline yet)
616 start_time
= time
.time()
619 end_time
= start_time
+ timeout
621 if end_time
and (time
.time() >= end_time
):
622 logger
.info(desc
+ ':timeout after %s seconds' % timeout
)
625 if reads
and process
.poll() is not None:
626 # we want to stop, but first read off anything remaining
630 reads
, _
, _
= select
.select(
631 [process
.stdout
.fileno(), process
.stderr
.fileno()],
636 message_b
= os
.read(fd
, 1024)
637 if isinstance(message_b
, bytes
):
638 message
= message_b
.decode('utf-8')
639 if isinstance(message_b
, str):
641 if fd
== process
.stdout
.fileno():
643 message
= out_buffer
+ message
644 lines
= message
.split('\n')
645 out_buffer
= lines
.pop()
648 logger
.info(desc
+ ':stdout ' + line
)
650 logger
.debug(desc
+ ':stdout ' + line
)
651 elif fd
== process
.stderr
.fileno():
653 message
= err_buffer
+ message
654 lines
= message
.split('\n')
655 err_buffer
= lines
.pop()
658 logger
.info(desc
+ ':stderr ' + line
)
660 logger
.debug(desc
+ ':stderr ' + line
)
663 except (IOError, OSError):
666 returncode
= process
.wait()
670 logger
.info(desc
+ ':stdout ' + out_buffer
)
672 logger
.debug(desc
+ ':stdout ' + out_buffer
)
675 logger
.info(desc
+ ':stderr ' + err_buffer
)
677 logger
.debug(desc
+ ':stderr ' + err_buffer
)
679 if returncode
!= 0 and verbose_on_failure
and not verbose
:
680 # dump stdout + stderr
681 logger
.info('Non-zero exit code %d from %s' % (returncode
, ' '.join(command
)))
682 for line
in out
.splitlines():
683 logger
.info(desc
+ ':stdout ' + line
)
684 for line
in err
.splitlines():
685 logger
.info(desc
+ ':stderr ' + line
)
687 return out
, err
, returncode
def call_throws(command, **kwargs):
    # type: (List[str], Any) -> Tuple[str, str, int]
    """Run *command* via call(); raise RuntimeError on a non-zero exit.

    Returns (stdout, stderr, returncode) on success.
    """
    out, err, ret = call(command, **kwargs)
    if ret:
        raise RuntimeError('Failed command: %s' % ' '.join(command))
    return out, err, ret
698 def call_timeout(command
, timeout
):
699 # type: (List[str], int) -> int
701 logger
.debug('Running command (timeout=%s): %s'
702 % (timeout
, ' '.join(command
)))
704 def raise_timeout(command
, timeout
):
705 # type: (List[str], int) -> NoReturn
706 msg
= 'Command \'%s\' timed out after %s seconds' % (command
, timeout
)
708 raise TimeoutExpired(msg
)
710 def call_timeout_py2(command
, timeout
):
711 # type: (List[str], int) -> int
712 proc
= subprocess
.Popen(command
)
713 thread
= Thread(target
=proc
.wait
)
716 if thread
.is_alive():
719 raise_timeout(command
, timeout
)
720 return proc
.returncode
722 def call_timeout_py3(command
, timeout
):
723 # type: (List[str], int) -> int
725 return subprocess
.call(command
, timeout
=timeout
)
726 except subprocess
.TimeoutExpired
as e
:
727 raise_timeout(command
, timeout
)
730 if sys
.version_info
>= (3, 3):
731 ret
= call_timeout_py3(command
, timeout
)
733 # py2 subprocess has no timeout arg
734 ret
= call_timeout_py2(command
, timeout
)
737 ##################################
739 def is_available(what
, func
):
740 # type: (str, Callable[[], bool]) -> None
742 Wait for a service to become available
744 :param what: the name of the service
745 :param func: the callable object that determines availability
748 logger
.info('Waiting for %s...' % (what
))
754 raise Error('%s not available after %s tries'
757 logger
.info('%s not available, waiting (%s/%s)...'
758 % (what
, num
, retry
))
765 # type: (Optional[str]) -> ConfigParser
766 # bend over backwards here because py2's ConfigParser doesn't like
767 # whitespace before config option names (e.g., '\n foo = bar\n').
769 if sys
.version_info
>= (3, 2):
772 cp
= SafeConfigParser()
775 with
open(fn
, 'r') as f
:
777 nice_conf
= re
.sub(r
'\n(\s)+', r
'\n', raw_conf
)
778 s_io
= StringIO(nice_conf
)
779 if sys
.version_info
>= (3, 2):
788 if not p
.startswith('/'):
789 return os
.path
.join(os
.getcwd(), p
)
792 def get_file_timestamp(fn
):
794 mt
= os
.path
.getmtime(fn
)
795 return datetime
.datetime
.fromtimestamp(
796 mt
, tz
=datetime
.timezone
.utc
798 except Exception as e
:
801 def try_convert_datetime(s
):
802 # This is super irritating because
803 # 1) podman and docker use different formats
804 # 2) python's strptime can't parse either one
807 # docker 18.09.7: 2020-03-03T09:21:43.636153304Z
808 # podman 1.7.0: 2020-03-03T15:52:30.136257504-06:00
809 # 2020-03-03 15:52:30.136257504 -0600 CST
810 # (In the podman case, there is a different string format for
811 # 'inspect' and 'inspect --format {{.Created}}'!!)
813 # In *all* cases, the 9 digit second precision is too much for
814 # python's strptime. Shorten it to 6 digits.
815 p
= re
.compile(r
'(\.[\d]{6})[\d]*')
818 # replace trailling Z with -0000, since (on python 3.6.8) it won't parse
819 if s
and s
[-1] == 'Z':
822 # cut off the redundnat 'CST' part that strptime can't parse, if
827 # try parsing with several format strings
829 '%Y-%m-%dT%H:%M:%S.%f%z',
830 '%Y-%m-%d %H:%M:%S.%f %z',
834 # return timestamp normalized to UTC, rendered as DATEFMT.
835 return datetime
.datetime
.strptime(s
, f
).astimezone(tz
=datetime
.timezone
.utc
).strftime(DATEFMT
)
def get_podman_version():
    # type: () -> Tuple[int, ...]
    """Return the podman version as a tuple of ints.

    Raises ValueError when the detected container engine is not podman.
    """
    if 'podman' not in container_path:
        raise ValueError('not using podman')
    version_out, _, _ = call_throws([container_path, '--version'])
    return _parse_podman_version(version_out)
847 def _parse_podman_version(out
):
848 # type: (str) -> Tuple[int, ...]
849 _
, _
, version_str
= out
.strip().split()
851 def to_int(val
, org_e
=None):
852 if not val
and org_e
:
856 except ValueError as e
:
857 return to_int(val
[0:-1], org_e
or e
)
859 return tuple(map(to_int
, version_str
.split('.')))
864 return socket
.gethostname()
868 return socket
.getfqdn() or socket
.gethostname()
872 return platform
.uname().machine
874 def generate_service_id():
876 return get_hostname() + '.' + ''.join(random
.choice(string
.ascii_lowercase
)
879 def generate_password():
881 return ''.join(random
.choice(string
.ascii_lowercase
+ string
.digits
)
def normalize_container_id(i):
    # type: (str) -> str
    """Strip a leading 'sha256:' from a container/image id.

    docker adds the sha256: prefix, but AFAICS both docker (18.09.7 in bionic
    at least) and podman always use sha256, so the prefix is redundant.
    """
    if i.startswith('sha256:'):
        return i[len('sha256:'):]
    return i
897 return str(uuid
.uuid1())
900 # type: (str) -> bool
907 def infer_fsid(func
):
909 If we only find a single fsid in /var/lib/ceph/*, use that
914 logger
.debug('Using specified fsid: %s' % args
.fsid
)
918 daemon_list
= list_daemons(detail
=False)
919 for daemon
in daemon_list
:
920 if 'name' not in args
or not args
.name
:
921 fsids
.add(daemon
['fsid'])
922 elif daemon
['name'] == args
.name
:
923 fsids
.add(daemon
['fsid'])
927 # some commands do not always require an fsid
929 elif len(fsids
) == 1:
930 logger
.info('Inferring fsid %s' % fsids
[0])
933 raise Error('Cannot infer an fsid, one must be specified: %s' % fsids
)
938 def infer_image(func
):
940 Use the most recent ceph image
945 args
.image
= os
.environ
.get('CEPHADM_IMAGE')
947 args
.image
= get_last_local_ceph_image()
949 args
.image
= DEFAULT_IMAGE
954 def default_image(func
):
956 def _default_image():
958 if 'name' in args
and args
.name
:
959 type_
= args
.name
.split('.', 1)[0]
960 if type_
in Monitoring
.components
:
961 args
.image
= Monitoring
.components
[type_
]['image']
963 args
.image
= os
.environ
.get('CEPHADM_IMAGE')
965 args
.image
= DEFAULT_IMAGE
968 return _default_image
970 def get_last_local_ceph_image():
972 :return: The most recent local ceph image (already pulled)
974 out
, _
, _
= call_throws(
975 [container_path
, 'images',
976 '--filter', 'label=ceph=True',
977 '--format', '{{.Repository}} {{.Tag}}'])
978 out_lines
= out
.splitlines()
979 if len(out_lines
) > 0:
980 repository
, tag
= out_lines
[0].split()
981 r
= '{}:{}'.format(repository
, tag
)
982 logger
.info('Using recent ceph image %s' % r
)
986 def write_tmp(s
, uid
, gid
):
987 tmp_f
= tempfile
.NamedTemporaryFile(mode
='w',
989 os
.fchown(tmp_f
.fileno(), uid
, gid
)
def makedirs(dir, uid, gid, mode):
    # type: (str, int, int, int) -> None
    """Ensure *dir* exists with the given owner, group and permission bits."""
    if os.path.exists(dir):
        os.chmod(dir, mode)
    else:
        os.makedirs(dir, mode=mode)
    os.chown(dir, uid, gid)
    # re-apply explicitly: the mode passed to makedirs() is masked by umask
    os.chmod(dir, mode)
def get_data_dir(fsid, t, n):
    # type: (str, str, Union[int, str]) -> str
    """Return the daemon data directory: <data_dir>/<fsid>/<type>.<id>."""
    daemon = '%s.%s' % (t, n)
    return os.path.join(args.data_dir, fsid, daemon)
def get_log_dir(fsid):
    # type: (str) -> str
    """Return the per-cluster log directory: <log_dir>/<fsid>."""
    path = os.path.join(args.log_dir, fsid)
    return path
def make_data_dir_base(fsid, uid, gid):
    # type: (str, int, int) -> str
    """Create the cluster data dir plus its crash/ and crash/posted/ subdirs,
    owned by uid:gid, and return the base path."""
    base = os.path.join(args.data_dir, fsid)
    makedirs(base, uid, gid, DATA_DIR_MODE)
    crash = os.path.join(base, 'crash')
    makedirs(crash, uid, gid, DATA_DIR_MODE)
    makedirs(os.path.join(crash, 'posted'), uid, gid, DATA_DIR_MODE)
    return base
def make_data_dir(fsid, daemon_type, daemon_id, uid=None, gid=None):
    # type: (str, str, Union[int, str], int, int) -> str
    """Create (if needed) a daemon's data dir and return its path.

    uid/gid default to the ceph uid/gid extracted from the container image.
    """
    if not (uid and gid):
        uid, gid = extract_uid_gid()
    make_data_dir_base(fsid, uid, gid)
    path = get_data_dir(fsid, daemon_type, daemon_id)
    makedirs(path, uid, gid, DATA_DIR_MODE)
    return path
def make_log_dir(fsid, uid=None, gid=None):
    # type: (str, int, int) -> str
    """Create (if needed) the cluster log dir and return its path.

    uid/gid default to the ceph uid/gid extracted from the container image.
    """
    if not (uid and gid):
        uid, gid = extract_uid_gid()
    path = get_log_dir(fsid)
    makedirs(path, uid, gid, LOG_DIR_MODE)
    return path
def make_var_run(fsid, uid, gid):
    # type: (str, int, int) -> None
    """Create /var/run/ceph/<fsid> (mode 0770, owned uid:gid) via install(1)."""
    cmd = ['install', '-d', '-m0770',
           '-o', str(uid), '-g', str(gid),
           '/var/run/ceph/%s' % fsid]
    call_throws(cmd)
1043 def copy_tree(src
, dst
, uid
=None, gid
=None):
1044 # type: (List[str], str, int, int) -> None
1046 Copy a directory tree from src to dst
1048 if not uid
or not gid
:
1049 (uid
, gid
) = extract_uid_gid()
1053 if os
.path
.isdir(dst
):
1054 dst_dir
= os
.path
.join(dst
, os
.path
.basename(src_dir
))
1056 logger
.debug('copy directory \'%s\' -> \'%s\'' % (src_dir
, dst_dir
))
1057 shutil
.rmtree(dst_dir
, ignore_errors
=True)
1058 shutil
.copytree(src_dir
, dst_dir
) # dirs_exist_ok needs python 3.8
1060 for dirpath
, dirnames
, filenames
in os
.walk(dst_dir
):
1061 logger
.debug('chown %s:%s \'%s\'' % (uid
, gid
, dirpath
))
1062 os
.chown(dirpath
, uid
, gid
)
1063 for filename
in filenames
:
1064 logger
.debug('chown %s:%s \'%s\'' % (uid
, gid
, filename
))
1065 os
.chown(os
.path
.join(dirpath
, filename
), uid
, gid
)
def copy_files(src, dst, uid=None, gid=None):
    # type: (List[str], str, int, int) -> None
    """Copy each file in *src* to *dst*, chowning the copies to uid:gid.

    When *dst* is a directory, files keep their basenames inside it;
    otherwise every source is copied onto the *dst* path itself.
    uid/gid default to the ceph uid/gid extracted from the container image.
    """
    if not (uid and gid):
        uid, gid = extract_uid_gid()

    for src_file in src:
        if os.path.isdir(dst):
            dst_file = os.path.join(dst, os.path.basename(src_file))
        else:
            dst_file = dst

        logger.debug('copy file \'%s\' -> \'%s\'' % (src_file, dst_file))
        shutil.copyfile(src_file, dst_file)

        logger.debug('chown %s:%s \'%s\'' % (uid, gid, dst_file))
        os.chown(dst_file, uid, gid)
1087 def move_files(src
, dst
, uid
=None, gid
=None):
1088 # type: (List[str], str, int, int) -> None
1090 Move files from src to dst
1092 if not uid
or not gid
:
1093 (uid
, gid
) = extract_uid_gid()
1095 for src_file
in src
:
1097 if os
.path
.isdir(dst
):
1098 dst_file
= os
.path
.join(dst
, os
.path
.basename(src_file
))
1100 if os
.path
.islink(src_file
):
1101 # shutil.move() in py2 does not handle symlinks correctly
1102 src_rl
= os
.readlink(src_file
)
1103 logger
.debug("symlink '%s' -> '%s'" % (dst_file
, src_rl
))
1104 os
.symlink(src_rl
, dst_file
)
1107 logger
.debug("move file '%s' -> '%s'" % (src_file
, dst_file
))
1108 shutil
.move(src_file
, dst_file
)
1109 logger
.debug('chown %s:%s \'%s\'' % (uid
, gid
, dst_file
))
1110 os
.chown(dst_file
, uid
, gid
)
1112 ## copied from distutils ##
1113 def find_executable(executable
, path
=None):
1114 """Tries to find 'executable' in the directories listed in 'path'.
1115 A string listing directories separated by 'os.pathsep'; defaults to
1116 os.environ['PATH']. Returns the complete filename or None if not found.
1118 _
, ext
= os
.path
.splitext(executable
)
1119 if (sys
.platform
== 'win32') and (ext
!= '.exe'):
1120 executable
= executable
+ '.exe'
1122 if os
.path
.isfile(executable
):
1126 path
= os
.environ
.get('PATH', None)
1129 path
= os
.confstr("CS_PATH")
1130 except (AttributeError, ValueError):
1131 # os.confstr() or CS_PATH is not available
1133 # bpo-35755: Don't use os.defpath if the PATH environment variable is
1134 # set to an empty string
1136 # PATH='' doesn't match, whereas PATH=':' looks in the current directory
1140 paths
= path
.split(os
.pathsep
)
1142 f
= os
.path
.join(p
, executable
)
1143 if os
.path
.isfile(f
):
1144 # the file exists, we have a shot at spawn working
def find_program(filename):
    # type: (str) -> str
    """Locate *filename* on PATH; raise ValueError if it cannot be found."""
    name = find_executable(filename)
    if name is None:
        raise ValueError('%s not found' % filename)
    return name
def get_unit_name(fsid, daemon_type, daemon_id=None):
    # type: (str, str, Optional[Union[int, str]]) -> str
    """Return the systemd unit name for a daemon.

    Accepts either a bare type or a type + id; the id, when given, is
    appended as '.<id>' to the unit's instance name.
    """
    if daemon_id is None:
        return 'ceph-%s@%s' % (fsid, daemon_type)
    return 'ceph-%s@%s.%s' % (fsid, daemon_type, daemon_id)
1163 def check_unit(unit_name
):
1164 # type: (str) -> Tuple[bool, str, bool]
1165 # NOTE: we ignore the exit code here because systemctl outputs
1166 # various exit codes based on the state of the service, but the
1167 # string result is more explicit (and sufficient).
1171 out
, err
, code
= call(['systemctl', 'is-enabled', unit_name
],
1172 verbose_on_failure
=False)
1176 elif "disabled" in out
:
1178 except Exception as e
:
1179 logger
.warning('unable to run systemctl: %s' % e
)
1185 out
, err
, code
= call(['systemctl', 'is-active', unit_name
],
1186 verbose_on_failure
=False)
1188 if out
in ['active']:
1190 elif out
in ['inactive']:
1192 elif out
in ['failed', 'auto-restart']:
1196 except Exception as e
:
1197 logger
.warning('unable to run systemctl: %s' % e
)
1199 return (enabled
, state
, installed
)
1201 def check_units(units
, enabler
=None):
1202 # type: (List[str], Optional[Packager]) -> bool
1204 (enabled
, state
, installed
) = check_unit(u
)
1205 if enabled
and state
== 'running':
1206 logger
.info('Unit %s is enabled and running' % u
)
1208 if enabler
is not None:
1210 logger
.info('Enabling unit %s' % u
)
1211 enabler
.enable_service(u
)
def get_legacy_config_fsid(cluster, legacy_dir=None):
    # type: (str, str) -> Optional[str]
    """Read the fsid from a legacy /etc/ceph/<cluster>.conf, if one exists.

    *legacy_dir*, when given, is prepended to the config path (used when
    inspecting a foreign root, e.g. in tests). Returns None when the file
    is absent or has no [global] fsid.
    """
    config_file = '/etc/ceph/%s.conf' % cluster
    if legacy_dir is not None:
        config_file = os.path.abspath(legacy_dir + config_file)

    if not os.path.exists(config_file):
        return None
    config = read_config(config_file)
    if config.has_section('global') and config.has_option('global', 'fsid'):
        return config.get('global', 'fsid')
    return None
1226 def get_legacy_daemon_fsid(cluster
, daemon_type
, daemon_id
, legacy_dir
=None):
1227 # type: (str, str, Union[int, str], str) -> Optional[str]
1229 if daemon_type
== 'osd':
1231 fsid_file
= os
.path
.join(args
.data_dir
,
1233 'ceph-%s' % daemon_id
,
1235 if legacy_dir
is not None:
1236 fsid_file
= os
.path
.abspath(legacy_dir
+ fsid_file
)
1237 with
open(fsid_file
, 'r') as f
:
1238 fsid
= f
.read().strip()
1242 fsid
= get_legacy_config_fsid(cluster
, legacy_dir
=legacy_dir
)
1245 def get_daemon_args(fsid
, daemon_type
, daemon_id
):
1246 # type: (str, str, Union[int, str]) -> List[str]
1247 r
= list() # type: List[str]
1249 if daemon_type
in Ceph
.daemons
and daemon_type
!= 'crash':
1251 '--setuser', 'ceph',
1252 '--setgroup', 'ceph',
1253 '--default-log-to-file=false',
1254 '--default-log-to-stderr=true',
1255 '--default-log-stderr-prefix="debug "',
1257 if daemon_type
== 'mon':
1259 '--default-mon-cluster-log-to-file=false',
1260 '--default-mon-cluster-log-to-stderr=true',
1262 elif daemon_type
in Monitoring
.components
:
1263 metadata
= Monitoring
.components
[daemon_type
]
1264 r
+= metadata
.get('args', list())
1265 if daemon_type
== 'alertmanager':
1266 config
= get_parm(args
.config_json
)
1267 peers
= config
.get('peers', list()) # type: ignore
1269 r
+= ["--cluster.peer={}".format(peer
)]
1270 elif daemon_type
== NFSGanesha
.daemon_type
:
1271 r
+= NFSGanesha
.daemon_args
1275 def create_daemon_dirs(fsid
, daemon_type
, daemon_id
, uid
, gid
,
1276 config
=None, keyring
=None,
1278 # type: (str, str, Union[int, str], int, int, Optional[str], Optional[str], Optional[bool]) -> None
1279 data_dir
= make_data_dir(fsid
, daemon_type
, daemon_id
, uid
=uid
, gid
=gid
)
1280 make_log_dir(fsid
, uid
=uid
, gid
=gid
)
1283 config_path
= os
.path
.join(data_dir
, 'config')
1284 with
open(config_path
, 'w') as f
:
1285 os
.fchown(f
.fileno(), uid
, gid
)
1286 os
.fchmod(f
.fileno(), 0o600)
1289 keyring_path
= os
.path
.join(data_dir
, 'keyring')
1290 with
open(keyring_path
, 'w') as f
:
1291 os
.fchmod(f
.fileno(), 0o600)
1292 os
.fchown(f
.fileno(), uid
, gid
)
1295 if daemon_type
in Monitoring
.components
.keys():
1296 config
= get_parm(args
.config_json
) # type: ignore
1297 required_files
= Monitoring
.components
[daemon_type
].get('config-json-files', list())
1299 # Set up directories specific to the monitoring component
1301 if daemon_type
== 'prometheus':
1302 data_dir_root
= get_data_dir(fsid
, daemon_type
, daemon_id
)
1303 config_dir
= 'etc/prometheus'
1304 makedirs(os
.path
.join(data_dir_root
, config_dir
), uid
, gid
, 0o755)
1305 makedirs(os
.path
.join(data_dir_root
, config_dir
, 'alerting'), uid
, gid
, 0o755)
1306 makedirs(os
.path
.join(data_dir_root
, 'data'), uid
, gid
, 0o755)
1307 elif daemon_type
== 'grafana':
1308 data_dir_root
= get_data_dir(fsid
, daemon_type
, daemon_id
)
1309 config_dir
= 'etc/grafana'
1310 makedirs(os
.path
.join(data_dir_root
, config_dir
), uid
, gid
, 0o755)
1311 makedirs(os
.path
.join(data_dir_root
, config_dir
, 'certs'), uid
, gid
, 0o755)
1312 makedirs(os
.path
.join(data_dir_root
, config_dir
, 'provisioning/datasources'), uid
, gid
, 0o755)
1313 makedirs(os
.path
.join(data_dir_root
, 'data'), uid
, gid
, 0o755)
1314 elif daemon_type
== 'alertmanager':
1315 data_dir_root
= get_data_dir(fsid
, daemon_type
, daemon_id
)
1316 config_dir
= 'etc/alertmanager'
1317 makedirs(os
.path
.join(data_dir_root
, config_dir
), uid
, gid
, 0o755)
1318 makedirs(os
.path
.join(data_dir_root
, config_dir
, 'data'), uid
, gid
, 0o755)
1321 # populate the config directory for the component from the config-json
1322 for fname
in required_files
:
1323 if 'files' in config
: # type: ignore
1324 if isinstance(config
['files'][fname
], list): # type: ignore
1325 content
= '\n'.join(config
['files'][fname
]) # type: ignore
1327 content
= config
['files'][fname
] # type: ignore
1329 with
open(os
.path
.join(data_dir_root
, config_dir
, fname
), 'w') as f
:
1330 os
.fchown(f
.fileno(), uid
, gid
)
1331 os
.fchmod(f
.fileno(), 0o600)
1334 if daemon_type
== NFSGanesha
.daemon_type
:
1335 nfs_ganesha
= NFSGanesha
.init(fsid
, daemon_id
)
1336 nfs_ganesha
.create_daemon_dirs(data_dir
, uid
, gid
)
def get_parm(option):
    # type: (str) -> Dict[str, str]
    """Parse a ``--config-json`` style option value into a dict.

    *option* may be:
      - ``'-'``: read JSON from stdin (cached so repeated calls do not
        try to consume stdin twice; an ``injected_stdin`` global, prepended
        when the script is piped to the interpreter, takes precedence),
      - an inline JSON object (starts with ``{`` and ends with ``}``),
      - a path to a JSON file on disk.

    Raises ``Error`` if the file does not exist or the JSON is invalid.
    """
    global cached_stdin
    if option == '-':
        if cached_stdin is not None:
            # stdin can only be read once; reuse the cached copy
            j = cached_stdin
        else:
            try:
                # injected_stdin is defined only when the script was piped
                # to python3 with a prelude (see usage notes at file top)
                j = injected_stdin  # type: ignore
            except NameError:
                j = sys.stdin.read()
            cached_stdin = j
    else:
        # inline json string
        if option[0] == '{' and option[-1] == '}':
            j = option
        # json file
        elif os.path.exists(option):
            with open(option, 'r') as f:
                j = f.read()
        else:
            raise Error("Config file {} not found".format(option))

    try:
        js = json.loads(j)
    except ValueError as e:
        raise Error("Invalid JSON in {}: {}".format(option, e))
    else:
        return js
def get_config_and_keyring():
    # type: () -> Tuple[str, str]
    """Resolve the ceph.conf text and keyring text from the CLI args.

    Sources, in order of increasing precedence:
      - ``--config-json``: a JSON blob/file with 'config' and 'keyring' keys,
      - ``--config``: a ceph.conf file on disk,
      - ``--key`` (builds a one-entry keyring for ``args.name``) or
        ``--keyring`` (a keyring file on disk).

    Raises ``Error`` if no config or no keyring could be resolved.
    """
    # Initialize both so a missing source raises the intended Error below
    # rather than an UnboundLocalError, and so a config-json blob lacking
    # 'config'/'keyring' keys is also caught by the validation.
    config = None
    keyring = None

    if 'config_json' in args and args.config_json:
        d = get_parm(args.config_json)
        config = d.get('config')
        keyring = d.get('keyring')

    if 'config' in args and args.config:
        with open(args.config, 'r') as f:
            config = f.read()

    if 'key' in args and args.key:
        keyring = '[%s]\n\tkey = %s\n' % (args.name, args.key)
    elif 'keyring' in args and args.keyring:
        with open(args.keyring, 'r') as f:
            keyring = f.read()

    if not config:
        raise Error('no config provided')
    elif not keyring:
        raise Error('no keyring provided')

    return (config, keyring)
def get_container_mounts(fsid, daemon_type, daemon_id,
                         no_config=False):
    # type: (str, str, Union[int, str, None], Optional[bool]) -> Dict[str, str]
    """Build the host-path -> container-path bind-mount map for a daemon.

    The returned dict maps host directories/files to container mount specs
    (including SELinux ':z'/':Z' relabel suffixes where appropriate).
    When *no_config* is true the per-daemon config file is not mounted over
    /etc/ceph/ceph.conf.
    """
    mounts = dict()

    # Cluster-wide mounts shared by all ceph daemons: run dir, log dir,
    # and the crash-collector drop directory (only if they exist on the host).
    if daemon_type in Ceph.daemons:
        if fsid:
            run_path = os.path.join('/var/run/ceph', fsid);
            if os.path.exists(run_path):
                mounts[run_path] = '/var/run/ceph:z'
            log_dir = get_log_dir(fsid)
            mounts[log_dir] = '/var/log/ceph:z'
            crash_dir = '/var/lib/ceph/%s/crash' % fsid
            if os.path.exists(crash_dir):
                mounts[crash_dir] = '/var/lib/ceph/crash:z'

    # Per-daemon data directory, mounted where that daemon expects its
    # data to live inside the container (rgw uses a different layout).
    if daemon_type in Ceph.daemons and daemon_id:
        data_dir = get_data_dir(fsid, daemon_type, daemon_id)
        if daemon_type == 'rgw':
            cdata_dir = '/var/lib/ceph/radosgw/ceph-rgw.%s' % (daemon_id)
        else:
            cdata_dir = '/var/lib/ceph/%s/ceph-%s' % (daemon_type, daemon_id)
        if daemon_type != 'crash':
            mounts[data_dir] = cdata_dir + ':z'
        if not no_config:
            mounts[data_dir + '/config'] = '/etc/ceph/ceph.conf:z'
        if daemon_type == 'rbd-mirror' or daemon_type == 'crash':
            # these do not search for their keyrings in a data directory
            mounts[data_dir + '/keyring'] = '/etc/ceph/ceph.client.%s.%s.keyring' % (daemon_type, daemon_id)

    # Device access for daemons that talk to block devices / udev.
    if daemon_type in ['mon', 'osd']:
        mounts['/dev'] = '/dev'  # FIXME: narrow this down?
        mounts['/run/udev'] = '/run/udev'
    if daemon_type == 'osd':
        mounts['/sys'] = '/sys'  # for numa.cc, pick_address, cgroups, ...
        mounts['/run/lvm'] = '/run/lvm'
        mounts['/run/lock/lvm'] = '/run/lock/lvm'

    # Monitoring stack containers get their config/data trees from the
    # per-daemon data dir populated by create_daemon_dirs().
    if daemon_type in Monitoring.components and daemon_id:
        data_dir = get_data_dir(fsid, daemon_type, daemon_id)
        if daemon_type == 'prometheus':
            mounts[os.path.join(data_dir, 'etc/prometheus')] = '/etc/prometheus:Z'
            mounts[os.path.join(data_dir, 'data')] = '/prometheus:Z'
        elif daemon_type == 'node-exporter':
            # node-exporter reads host metrics, so it mounts the host
            # /proc, /sys and / read-only under /host and /rootfs
            mounts['/proc'] = '/host/proc:ro'
            mounts['/sys'] = '/host/sys:ro'
            mounts['/'] = '/rootfs:ro'
        elif daemon_type == "grafana":
            mounts[os.path.join(data_dir, 'etc/grafana/grafana.ini')] = '/etc/grafana/grafana.ini:Z'
            mounts[os.path.join(data_dir, 'etc/grafana/provisioning/datasources')] = '/etc/grafana/provisioning/datasources:Z'
            mounts[os.path.join(data_dir, 'etc/grafana/certs')] = '/etc/grafana/certs:Z'
        elif daemon_type == 'alertmanager':
            mounts[os.path.join(data_dir, 'etc/alertmanager')] = '/alertmanager:Z'

    # NFS-Ganesha defines its own mount layout.
    if daemon_type == NFSGanesha.daemon_type:
        data_dir = get_data_dir(fsid, daemon_type, daemon_id)
        mounts.update(NFSGanesha.get_container_mounts(data_dir))

    return mounts
def get_container(fsid, daemon_type, daemon_id,
                  privileged=False,
                  ptrace=False,
                  container_args=[]):
    # type: (str, str, Union[int, str], bool, bool, List[str]) -> CephContainer
    """Build the CephContainer object used to run a daemon.

    Selects the container entrypoint, auth name, daemon CLI args, env vars
    and bind mounts based on *daemon_type*. *container_args* are extra
    flags passed through to the container runtime.
    """
    # Defensive copy: the default [] is a shared object, and the monitoring
    # branch below extends this list — mutating the default would leak
    # monitoring flags (e.g. --user) into every subsequent call.
    container_args = list(container_args)

    if daemon_type in ['mon', 'osd']:
        # mon and osd need privileged in order for libudev to query devices
        privileged = True
    if daemon_type == 'rgw':
        entrypoint = '/usr/bin/radosgw'
        name = 'client.rgw.%s' % daemon_id
    elif daemon_type == 'rbd-mirror':
        entrypoint = '/usr/bin/rbd-mirror'
        name = 'client.rbd-mirror.%s' % daemon_id
    elif daemon_type == 'crash':
        entrypoint = '/usr/bin/ceph-crash'
        name = 'client.crash.%s' % daemon_id
    elif daemon_type in ['mon', 'mgr', 'mds', 'osd']:
        entrypoint = '/usr/bin/ceph-' + daemon_type
        name = '%s.%s' % (daemon_type, daemon_id)
    elif daemon_type in Monitoring.components:
        # monitoring images define their own entrypoint
        entrypoint = ''
        name = ''
    elif daemon_type == NFSGanesha.daemon_type:
        entrypoint = NFSGanesha.entrypoint
        name = '%s.%s' % (daemon_type, daemon_id)
    else:
        entrypoint = ''
        name = ''

    ceph_args = []  # type: List[str]
    if daemon_type in Monitoring.components:
        uid, gid = extract_uid_gid_monitoring(daemon_type)
        m = Monitoring.components[daemon_type]  # type: ignore
        metadata = m.get('image', dict())  # type: ignore
        monitoring_args = [
            '--user',
            str(uid),
            # FIXME: disable cpu/memory limits for the time being (not supported
            # by ubuntu 18.04 kernel!)
            #'--cpus',
            #metadata.get('cpus', '2'),
            #'--memory',
            #metadata.get('memory', '4GB')
        ]
        container_args.extend(monitoring_args)
    elif daemon_type == 'crash':
        ceph_args = ['-n', name]
    elif daemon_type in Ceph.daemons:
        # -f keeps the daemon in the foreground for the container runtime
        ceph_args = ['-n', name, '-f']

    envs = []  # type: List[str]
    if daemon_type == NFSGanesha.daemon_type:
        envs.extend(NFSGanesha.get_container_envs())

    return CephContainer(
        image=args.image,
        entrypoint=entrypoint,
        args=ceph_args + get_daemon_args(fsid, daemon_type, daemon_id),
        container_args=container_args,
        volume_mounts=get_container_mounts(fsid, daemon_type, daemon_id),
        cname='ceph-%s-%s.%s' % (fsid, daemon_type, daemon_id),
        envs=envs,
        privileged=privileged,
        ptrace=ptrace,
    )
def extract_uid_gid(img='', file_path='/var/lib/ceph'):
    # type: (str, str) -> Tuple[int, int]
    """Return the (uid, gid) owning *file_path* inside the container image.

    Runs ``stat -c '%u %g'`` in a throwaway container; defaults to the
    image configured in args when *img* is empty.
    """
    image = img or args.image
    stat_out = CephContainer(
        image=image,
        entrypoint='stat',
        args=['-c', '%u %g', file_path]
    ).run()
    uid_str, gid_str = stat_out.split(' ')
    return (int(uid_str), int(gid_str))
def deploy_daemon(fsid, daemon_type, daemon_id, c, uid, gid,
                  config=None, keyring=None,
                  osd_fsid=None,
                  reconfig=False):
    # type: (str, str, Union[int, str], CephContainer, int, int, Optional[str], Optional[str], Optional[str], Optional[bool]) -> None
    """Deploy (or reconfigure) one daemon: data dirs, config/keyring,
    systemd units, firewall rules.

    For a brand-new mon this also runs ``ceph-mon --mkfs`` inside a
    container before the unit files are written. When *reconfig* is true
    only the config/keyring and unit files are refreshed; non-ceph daemons
    are additionally restarted to pick up the new config.
    """
    data_dir = get_data_dir(fsid, daemon_type, daemon_id)
    if reconfig and not os.path.exists(data_dir):
        raise Error('cannot reconfig, data path %s does not exist' % data_dir)
    if daemon_type == 'mon' and not os.path.exists(data_dir):
        # first mon: mkfs it with a temporary config + keyring before any
        # unit is installed
        assert config
        assert keyring

        # tmp keyring file
        tmp_keyring = write_tmp(keyring, uid, gid)

        # tmp config file
        tmp_config = write_tmp(config, uid, gid)

        # --mkfs
        create_daemon_dirs(fsid, daemon_type, daemon_id, uid, gid)
        mon_dir = get_data_dir(fsid, 'mon', daemon_id)
        log_dir = get_log_dir(fsid)
        out = CephContainer(
            image=args.image,
            entrypoint='/usr/bin/ceph-mon',
            args=['--mkfs',
                  '-i', str(daemon_id),
                  '--fsid', fsid,
                  '-c', '/tmp/config',
                  '--keyring', '/tmp/keyring',
            ] + get_daemon_args(fsid, 'mon', daemon_id),
            volume_mounts={
                log_dir: '/var/log/ceph:z',
                mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % (daemon_id),
                tmp_keyring.name: '/tmp/keyring:z',
                tmp_config.name: '/tmp/config:z',
            },
        ).run()

        # write conf into the mon data dir (mode 0600, owned by ceph uid/gid)
        with open(mon_dir + '/config', 'w') as f:
            os.fchown(f.fileno(), uid, gid)
            os.fchmod(f.fileno(), 0o600)
            f.write(config)
    else:
        # dirs, conf, keyring
        create_daemon_dirs(
            fsid, daemon_type, daemon_id,
            uid, gid,
            config, keyring)

    if not reconfig:
        deploy_daemon_units(fsid, uid, gid, daemon_type, daemon_id, c,
                            osd_fsid=osd_fsid)

    # unit.created records when this deployment first happened (mtime)
    if not os.path.exists(data_dir + '/unit.created'):
        with open(data_dir + '/unit.created', 'w') as f:
            os.fchmod(f.fileno(), 0o600)
            os.fchown(f.fileno(), uid, gid)
            f.write('mtime is time the daemon deployment was created\n')

    # unit.configured records when config was last (re)applied (mtime)
    with open(data_dir + '/unit.configured', 'w') as f:
        f.write('mtime is time we were last configured\n')
        os.fchmod(f.fileno(), 0o600)
        os.fchown(f.fileno(), uid, gid)

    update_firewalld(daemon_type)

    if reconfig and daemon_type not in Ceph.daemons:
        # ceph daemons do not need a restart; others (presumably) do to pick
        # up the new config
        call_throws(['systemctl', 'reset-failed',
                     get_unit_name(fsid, daemon_type, daemon_id)])
        call_throws(['systemctl', 'restart',
                     get_unit_name(fsid, daemon_type, daemon_id)])
def deploy_daemon_units(fsid, uid, gid, daemon_type, daemon_id, c,
                        enable=True, start=True,
                        osd_fsid=None):
    # type: (str, int, int, str, Union[int, str], CephContainer, bool, bool, Optional[str]) -> None
    """Write the per-daemon unit.run / unit.poststop / unit.image files and
    install + (optionally) enable/start the systemd unit.

    Each generated file is written to a ``.new`` path first and then
    atomically renamed into place so a crash mid-write never leaves a
    truncated script behind.
    """
    # cmd
    data_dir = get_data_dir(fsid, daemon_type, daemon_id)
    with open(data_dir + '/unit.run.new', 'w') as f:
        # pre-start command(s)
        if daemon_type == 'osd':
            # osds have a pre-start step: activate the LVM volume before
            # the osd container starts
            assert osd_fsid
            prestart = CephContainer(
                image=args.image,
                entrypoint='/usr/sbin/ceph-volume',
                args=[
                    'lvm', 'activate',
                    str(daemon_id), osd_fsid,
                    '--no-systemd'
                ],
                privileged=True,
                volume_mounts=get_container_mounts(fsid, daemon_type, daemon_id),
                cname='ceph-%s-%s.%s-activate' % (fsid, daemon_type, daemon_id),
            )
            f.write(' '.join(prestart.run_cmd()) + '\n')
        elif daemon_type == NFSGanesha.daemon_type:
            # add nfs to the rados grace db
            nfs_ganesha = NFSGanesha.init(fsid, daemon_id)
            prestart = nfs_ganesha.get_rados_grace_container('add')
            f.write(' '.join(prestart.run_cmd()) + '\n')

        # container run command
        f.write(' '.join(c.run_cmd()) + '\n')
        os.fchmod(f.fileno(), 0o600)
    os.rename(data_dir + '/unit.run.new',
              data_dir + '/unit.run')

    # post-stop command(s)
    with open(data_dir + '/unit.poststop.new', 'w') as f:
        if daemon_type == 'osd':
            # deactivate the LVM volume once the osd container has stopped
            assert osd_fsid
            poststop = CephContainer(
                image=args.image,
                entrypoint='/usr/sbin/ceph-volume',
                args=[
                    'lvm', 'deactivate',
                    str(daemon_id), osd_fsid,
                ],
                privileged=True,
                volume_mounts=get_container_mounts(fsid, daemon_type, daemon_id),
                cname='ceph-%s-%s.%s-deactivate' % (fsid, daemon_type,
                                                    daemon_id),
            )
            f.write(' '.join(poststop.run_cmd()) + '\n')
        elif daemon_type == NFSGanesha.daemon_type:
            # remove nfs from the rados grace db
            nfs_ganesha = NFSGanesha.init(fsid, daemon_id)
            poststop = nfs_ganesha.get_rados_grace_container('remove')
            f.write(' '.join(poststop.run_cmd()) + '\n')
        os.fchmod(f.fileno(), 0o600)
    os.rename(data_dir + '/unit.poststop.new',
              data_dir + '/unit.poststop')

    # record which image this daemon runs, for later upgrade/inspection
    with open(data_dir + '/unit.image.new', 'w') as f:
        f.write(c.image + '\n')
        os.fchmod(f.fileno(), 0o600)
    os.rename(data_dir + '/unit.image.new',
              data_dir + '/unit.image')

    # systemd: the cluster-wide targets plus the templated per-daemon unit
    install_base_units(fsid)
    unit = get_unit_file(fsid, uid, gid)
    unit_file = 'ceph-%s@.service' % (fsid)
    with open(args.unit_dir + '/' + unit_file + '.new', 'w') as f:
        f.write(unit)
    os.rename(args.unit_dir + '/' + unit_file + '.new',
              args.unit_dir + '/' + unit_file)
    call_throws(['systemctl', 'daemon-reload'])

    unit_name = get_unit_name(fsid, daemon_type, daemon_id)
    # stop + reset-failed are best-effort: the unit may not exist yet
    call(['systemctl', 'stop', unit_name],
         verbose_on_failure=False)
    call(['systemctl', 'reset-failed', unit_name],
         verbose_on_failure=False)
    if enable:
        call_throws(['systemctl', 'enable', unit_name])
    if start:
        call_throws(['systemctl', 'start', unit_name])
def update_firewalld(daemon_type):
    # type: (str) -> None
    """Open the firewalld services/ports a daemon of *daemon_type* needs.

    No-op when --skip-firewalld was given, when firewall-cmd is not
    installed, or when firewalld.service is not enabled. Changes are made
    with --permanent and applied via --reload at the end.
    """
    if args.skip_firewalld:
        return
    cmd = find_executable('firewall-cmd')
    if not cmd:
        logger.debug('firewalld does not appear to be present')
        return
    (enabled, state, _) = check_unit('firewalld.service')
    if not enabled:
        logger.debug('firewalld.service is not enabled')
        return

    fw_services = []
    fw_ports = []
    if daemon_type == 'mon':
        fw_services.append('ceph-mon')
    elif daemon_type in ['mgr', 'mds', 'osd']:
        fw_services.append('ceph')
    if daemon_type == 'mgr':
        fw_ports.append(8080)  # dashboard
        fw_ports.append(8443)  # dashboard
        fw_ports.append(9283)  # mgr/prometheus exporter
    elif daemon_type in Monitoring.port_map.keys():
        fw_ports.extend(Monitoring.port_map[daemon_type])  # prometheus etc
    elif daemon_type == NFSGanesha.daemon_type:
        fw_services.append('nfs')

    for svc in fw_services:
        # --query-service returns non-zero when the service is not yet enabled
        out, err, ret = call([cmd, '--permanent', '--query-service', svc])
        if ret:
            logger.info('Enabling firewalld service %s in current zone...' % svc)
            out, err, ret = call([cmd, '--permanent', '--add-service', svc])
            if ret:
                raise RuntimeError(
                    'unable to add service %s to current zone: %s' % (svc, err))
        else:
            logger.debug('firewalld service %s is enabled in current zone' % svc)
    for port in fw_ports:
        tcp_port = str(port) + '/tcp'
        out, err, ret = call([cmd, '--permanent', '--query-port', tcp_port])
        if ret:
            logger.info('Enabling firewalld port %s in current zone...' % tcp_port)
            out, err, ret = call([cmd, '--permanent', '--add-port', tcp_port])
            if ret:
                raise RuntimeError('unable to add port %s to current zone: %s' %
                                   (tcp_port, err))
        else:
            logger.debug('firewalld port %s is enabled in current zone' % tcp_port)
    # apply all --permanent changes to the running configuration
    call_throws([cmd, '--reload'])
def install_base_units(fsid):
    # type: (str) -> None
    """
    Set up ceph.target and ceph-$fsid.target units.
    """
    # global unit: written unconditionally (atomic rename), but only
    # enabled/started the first time it appears
    existed = os.path.exists(args.unit_dir + '/ceph.target')
    with open(args.unit_dir + '/ceph.target.new', 'w') as f:
        f.write('[Unit]\n'
                'Description=All Ceph clusters and services\n'
                '\n'
                '[Install]\n'
                'WantedBy=multi-user.target\n')
    os.rename(args.unit_dir + '/ceph.target.new',
              args.unit_dir + '/ceph.target')
    if not existed:
        # we disable before enable in case a different ceph.target
        # (from the traditional package) is present; while newer
        # systemd is smart enough to disable the old
        # (/lib/systemd/...) and enable the new (/etc/systemd/...),
        # some older versions of systemd error out with EEXIST.
        call_throws(['systemctl', 'disable', 'ceph.target'])
        call_throws(['systemctl', 'enable', 'ceph.target'])
        call_throws(['systemctl', 'start', 'ceph.target'])

    # per-cluster target, tied into ceph.target
    existed = os.path.exists(args.unit_dir + '/ceph-%s.target' % fsid)
    with open(args.unit_dir + '/ceph-%s.target.new' % fsid, 'w') as f:
        f.write('[Unit]\n'
                'Description=Ceph cluster {fsid}\n'
                'PartOf=ceph.target\n'
                'Before=ceph.target\n'
                '\n'
                '[Install]\n'
                'WantedBy=multi-user.target ceph.target\n'.format(
                    fsid=fsid))
    os.rename(args.unit_dir + '/ceph-%s.target.new' % fsid,
              args.unit_dir + '/ceph-%s.target' % fsid)
    if not existed:
        call_throws(['systemctl', 'enable', 'ceph-%s.target' % fsid])
        call_throws(['systemctl', 'start', 'ceph-%s.target' % fsid])

    # logrotate for the cluster
    with open(args.logrotate_dir + '/ceph-%s' % fsid, 'w') as f:
        """
        This is a bit sloppy in that the killall/pkill will touch all ceph daemons
        in all containers, but I don't see an elegant way to send SIGHUP *just* to
        the daemons for this cluster.  (1) systemd kill -s will get the signal to
        podman, but podman will exit.  (2) podman kill will get the signal to the
        first child (bash), but that isn't the ceph daemon.  This is simpler and
        should be harmless.
        """
        f.write("""# created by cephadm
/var/log/ceph/%s/*.log {
    rotate 7
    daily
    compress
    sharedscripts
    postrotate
        killall -q -1 ceph-mon ceph-mgr ceph-mds ceph-osd ceph-fuse radosgw rbd-mirror || pkill -1 -x "ceph-mon|ceph-mgr|ceph-mds|ceph-osd|ceph-fuse|radosgw|rbd-mirror" || true
    endscript
    missingok
    notifempty
    su root root
}
""" % fsid)
def get_unit_file(fsid, uid, gid):
    # type: (str, int, int) -> str
    """Render the templated systemd unit (ceph-$fsid@.service) contents.

    The '%i' systemd instance specifier expands to '<daemon_type>.<id>';
    the unit just runs the generated unit.run / unit.poststop scripts.
    """
    install_path = find_program('install')
    u = """# generated by cephadm
[Unit]
Description=Ceph %i for {fsid}

# According to:
#   http://www.freedesktop.org/wiki/Software/systemd/NetworkTarget
# these can be removed once ceph-mon will dynamically change network
# configuration.
After=network-online.target local-fs.target time-sync.target
Wants=network-online.target local-fs.target time-sync.target

PartOf=ceph-{fsid}.target
Before=ceph-{fsid}.target

[Service]
LimitNOFILE=1048576
LimitNPROC=1048576
EnvironmentFile=-/etc/environment
ExecStartPre=-{container_path} rm ceph-{fsid}-%i
ExecStartPre=-{install_path} -d -m0770 -o {uid} -g {gid} /var/run/ceph/{fsid}
ExecStart=/bin/bash {data_dir}/{fsid}/%i/unit.run
ExecStop=-{container_path} stop ceph-{fsid}-%i
ExecStopPost=-/bin/bash {data_dir}/{fsid}/%i/unit.poststop
KillMode=none
Restart=on-failure
RestartSec=10s
TimeoutStartSec=120
TimeoutStopSec=15
StartLimitInterval=30min
StartLimitBurst=5

[Install]
WantedBy=ceph-{fsid}.target
""".format(
        container_path=container_path,
        install_path=install_path,
        fsid=fsid,
        uid=uid,
        gid=gid,
        data_dir=args.data_dir)
    return u
1865 ##################################
class CephContainer:
    """A podman/docker container invocation builder for one ceph process.

    Instances only *describe* a container; the actual process is launched
    by executing the argv lists returned by run_cmd()/shell_cmd()/exec_cmd().
    """

    def __init__(self,
                 image,
                 entrypoint,
                 args=[],
                 volume_mounts={},
                 cname='',
                 container_args=[],
                 envs=None,
                 privileged=False,
                 ptrace=False):
        # type: (str, str, List[str], Dict[str, str], str, List[str], Optional[List[str]], bool, bool) -> None
        # NOTE(review): args/volume_mounts/container_args use shared mutable
        # defaults; callers must not mutate them in place.
        self.image = image
        self.entrypoint = entrypoint
        self.args = args
        self.volume_mounts = volume_mounts
        self.cname = cname
        self.container_args = container_args
        self.envs = envs
        self.privileged = privileged
        self.ptrace = ptrace

    def run_cmd(self):
        # type: () -> List[str]
        """Return the full argv to run this container in the foreground."""
        vols = []  # type: List[str]
        envs = []  # type: List[str]
        cname = []  # type: List[str]
        entrypoint = []  # type: List[str]
        if self.entrypoint:
            entrypoint = ['--entrypoint', self.entrypoint]

        priv = []  # type: List[str]
        if self.privileged:
            priv = ['--privileged',
                    # let OSD etc read block devs that haven't been chowned
                    '--group-add=disk']
        if self.ptrace:
            priv.append('--cap-add=SYS_PTRACE')
        # one '-v host:container' pair per mount
        vols = sum(
            [['-v', '%s:%s' % (host_dir, container_dir)]
             for host_dir, container_dir in self.volume_mounts.items()], [])
        envs = [
            '-e', 'CONTAINER_IMAGE=%s' % self.image,
            '-e', 'NODE_NAME=%s' % get_hostname(),
        ]
        if self.envs:
            for e in self.envs:
                envs.extend(['-e', e])
        cname = ['--name', self.cname] if self.cname else []
        return [
            str(container_path),
            'run',
            '--rm',
            '--net=host',
        ] + self.container_args + priv + \
            cname + envs + \
            vols + entrypoint + \
            [
                self.image
            ] + self.args  # type: ignore

    def shell_cmd(self, cmd):
        # type: (List[str]) -> List[str]
        """Return an argv that runs *cmd* (argv list) inside this container,
        using cmd[0] as the entrypoint."""
        priv = []  # type: List[str]
        if self.privileged:
            priv = ['--privileged',
                    # let OSD etc read block devs that haven't been chowned
                    '--group-add=disk']
        vols = []  # type: List[str]
        vols = sum(
            [['-v', '%s:%s' % (host_dir, container_dir)]
             for host_dir, container_dir in self.volume_mounts.items()], [])
        envs = [
            '-e', 'CONTAINER_IMAGE=%s' % self.image,
            '-e', 'NODE_NAME=%s' % get_hostname(),
        ]
        if self.envs:
            for e in self.envs:
                envs.extend(['-e', e])
        cmd_args = []  # type: List[str]
        if cmd:
            cmd_args = ['-c'] + cmd
        return [
            str(container_path),
            'run',
            '--rm',
            '--net=host',
        ] + self.container_args + priv + envs + vols + [
            '--entrypoint', cmd[0],
            self.image
        ] + cmd[1:]

    def exec_cmd(self, cmd):
        # type: (List[str]) -> List[str]
        """Return an argv that execs *cmd* inside the already-running
        named container."""
        return [
            str(container_path),
            'exec',
        ] + self.container_args + [
            self.cname,
        ] + cmd

    def run(self, timeout=DEFAULT_TIMEOUT):
        # type: (Optional[int]) -> str
        """Run the container to completion and return its stdout; raises
        on non-zero exit (via call_throws)."""
        logger.debug(self.run_cmd())
        out, _, _ = call_throws(
            self.run_cmd(), desc=self.entrypoint, timeout=timeout)
        return out
1975 ##################################
def command_version():
    # type: () -> int
    """Print the ceph version reported by the configured container image."""
    version_out = CephContainer(args.image, 'ceph', ['--version']).run()
    print(version_out.strip())
    return 0
1984 ##################################
1989 logger
.info('Pulling latest %s...' % args
.image
)
1990 call_throws([container_path
, 'pull', args
.image
])
1991 return command_inspect_image()
1993 ##################################
def command_inspect_image():
    # type: () -> int
    """Print (as JSON) the container image id and ceph version of args.image."""
    out, err, ret = call_throws([
        container_path, 'inspect',
        '--format', '{{.Id}}',
        args.image])
    if ret:
        return errno.ENOENT
    image_id = normalize_container_id(out.strip())
    # run `ceph --version` in the image to discover the ceph release
    ver = CephContainer(args.image, 'ceph', ['--version']).run().strip()
    r = {
        'image_id': image_id,
        'ceph_version': ver,
    }
    print(json.dumps(r, indent=4, sort_keys=True))
    return 0
2013 ##################################
2016 def command_bootstrap():
2019 if not args
.output_config
:
2020 args
.output_config
= os
.path
.join(args
.output_dir
, 'ceph.conf')
2021 if not args
.output_keyring
:
2022 args
.output_keyring
= os
.path
.join(args
.output_dir
,
2023 'ceph.client.admin.keyring')
2024 if not args
.output_pub_ssh_key
:
2025 args
.output_pub_ssh_key
= os
.path
.join(args
.output_dir
, 'ceph.pub')
2027 # verify output files
2028 for f
in [args
.output_config
, args
.output_keyring
, args
.output_pub_ssh_key
]:
2029 if not args
.allow_overwrite
:
2030 if os
.path
.exists(f
):
2031 raise Error('%s already exists; delete or pass '
2032 '--allow-overwrite to overwrite' % f
)
2033 dirname
= os
.path
.dirname(f
)
2034 if dirname
and not os
.path
.exists(dirname
):
2035 raise Error('%s directory %s does not exist' % (f
, dirname
))
2037 if not args
.skip_prepare_host
:
2038 command_prepare_host()
2040 logger
.info('Skip prepare_host')
2043 fsid
= args
.fsid
or make_fsid()
2044 hostname
= get_hostname()
2045 if '.' in hostname
and not args
.allow_fqdn_hostname
:
2046 raise Error('hostname is a fully qualified domain name (%s); either fix (e.g., "sudo hostname %s" or similar) or pass --allow-fqdn-hostname' % (hostname
, hostname
.split('.')[0]))
2047 mon_id
= args
.mon_id
or hostname
2048 mgr_id
= args
.mgr_id
or generate_service_id()
2049 logging
.info('Cluster fsid: %s' % fsid
)
2055 r
= re
.compile(r
':(\d+)$')
2058 hasport
= r
.findall(args
.mon_ip
)
2060 port
= int(hasport
[0])
2062 addr_arg
= '[v1:%s]' % args
.mon_ip
2064 addr_arg
= '[v2:%s]' % args
.mon_ip
2066 logger
.warning('Using msgr2 protocol for unrecognized port %d' %
2068 addr_arg
= '[v2:%s]' % args
.mon_ip
2069 base_ip
= args
.mon_ip
[0:-(len(str(port
)))-1]
2070 check_ip_port(base_ip
, port
)
2072 base_ip
= args
.mon_ip
2073 addr_arg
= '[v2:%s:3300,v1:%s:6789]' % (args
.mon_ip
, args
.mon_ip
)
2074 check_ip_port(args
.mon_ip
, 3300)
2075 check_ip_port(args
.mon_ip
, 6789)
2076 elif args
.mon_addrv
:
2077 addr_arg
= args
.mon_addrv
2078 if addr_arg
[0] != '[' or addr_arg
[-1] != ']':
2079 raise Error('--mon-addrv value %s must use square backets' %
2081 for addr
in addr_arg
[1:-1].split(','):
2082 hasport
= r
.findall(addr
)
2084 raise Error('--mon-addrv value %s must include port number' %
2086 port
= int(hasport
[0])
2087 # strip off v1: or v2: prefix
2088 addr
= re
.sub(r
'^\w+:', '', addr
)
2089 base_ip
= addr
[0:-(len(str(port
)))-1]
2090 check_ip_port(base_ip
, port
)
2092 raise Error('must specify --mon-ip or --mon-addrv')
2093 logger
.debug('Base mon IP is %s, final addrv is %s' % (base_ip
, addr_arg
))
2096 if not args
.skip_mon_network
:
2097 # make sure IP is configured locally, and then figure out the
2099 for net
, ips
in list_networks().items():
2102 logger
.info('Mon IP %s is in CIDR network %s' % (base_ip
,
2106 raise Error('Failed to infer CIDR network for mon ip %s; pass '
2107 '--skip-mon-network to configure it later' % base_ip
)
2110 cp
= read_config(args
.config
)
2111 if not cp
.has_section('global'):
2112 cp
.add_section('global')
2113 cp
.set('global', 'fsid', fsid
);
2114 cp
.set('global', 'mon host', addr_arg
)
2115 cp
.set('global', 'container_image', args
.image
)
2118 config
= cpf
.getvalue()
2120 if not args
.skip_pull
:
2121 logger
.info('Pulling latest %s container...' % args
.image
)
2122 call_throws([container_path
, 'pull', args
.image
])
2124 logger
.info('Extracting ceph user uid/gid from container image...')
2125 (uid
, gid
) = extract_uid_gid()
2127 # create some initial keys
2128 logger
.info('Creating initial keys...')
2129 mon_key
= CephContainer(
2131 entrypoint
='/usr/bin/ceph-authtool',
2132 args
=['--gen-print-key'],
2134 admin_key
= CephContainer(
2136 entrypoint
='/usr/bin/ceph-authtool',
2137 args
=['--gen-print-key'],
2139 mgr_key
= CephContainer(
2141 entrypoint
='/usr/bin/ceph-authtool',
2142 args
=['--gen-print-key'],
2145 keyring
= ('[mon.]\n'
2147 '\tcaps mon = allow *\n'
2150 '\tcaps mon = allow *\n'
2151 '\tcaps mds = allow *\n'
2152 '\tcaps mgr = allow *\n'
2153 '\tcaps osd = allow *\n'
2156 '\tcaps mon = profile mgr\n'
2157 '\tcaps mds = allow *\n'
2158 '\tcaps osd = allow *\n'
2159 % (mon_key
, admin_key
, mgr_id
, mgr_key
))
2162 tmp_bootstrap_keyring
= write_tmp(keyring
, uid
, gid
)
2164 # create initial monmap, tmp monmap file
2165 logger
.info('Creating initial monmap...')
2166 tmp_monmap
= write_tmp('', 0, 0)
2167 out
= CephContainer(
2169 entrypoint
='/usr/bin/monmaptool',
2173 '--addv', mon_id
, addr_arg
,
2177 tmp_monmap
.name
: '/tmp/monmap:z',
2181 # pass monmap file to ceph user for use by ceph-mon --mkfs below
2182 os
.fchown(tmp_monmap
.fileno(), uid
, gid
)
2185 logger
.info('Creating mon...')
2186 create_daemon_dirs(fsid
, 'mon', mon_id
, uid
, gid
)
2187 mon_dir
= get_data_dir(fsid
, 'mon', mon_id
)
2188 log_dir
= get_log_dir(fsid
)
2189 out
= CephContainer(
2191 entrypoint
='/usr/bin/ceph-mon',
2196 '--monmap', '/tmp/monmap',
2197 '--keyring', '/tmp/keyring',
2198 ] + get_daemon_args(fsid
, 'mon', mon_id
),
2200 log_dir
: '/var/log/ceph:z',
2201 mon_dir
: '/var/lib/ceph/mon/ceph-%s:z' % (mon_id
),
2202 tmp_bootstrap_keyring
.name
: '/tmp/keyring:z',
2203 tmp_monmap
.name
: '/tmp/monmap:z',
2207 with
open(mon_dir
+ '/config', 'w') as f
:
2208 os
.fchown(f
.fileno(), uid
, gid
)
2209 os
.fchmod(f
.fileno(), 0o600)
2212 make_var_run(fsid
, uid
, gid
)
2213 mon_c
= get_container(fsid
, 'mon', mon_id
)
2214 deploy_daemon(fsid
, 'mon', mon_id
, mon_c
, uid
, gid
,
2215 config
=None, keyring
=None)
2217 # client.admin key + config to issue various CLI commands
2218 tmp_admin_keyring
= write_tmp('[client.admin]\n'
2219 '\tkey = ' + admin_key
+ '\n',
2221 tmp_config
= write_tmp(config
, uid
, gid
)
2223 # a CLI helper to reduce our typing
2224 def cli(cmd
, extra_mounts
={}, timeout
=DEFAULT_TIMEOUT
):
2225 # type: (List[str], Dict[str, str], Optional[int]) -> str
2227 log_dir
: '/var/log/ceph:z',
2228 tmp_admin_keyring
.name
: '/etc/ceph/ceph.client.admin.keyring:z',
2229 tmp_config
.name
: '/etc/ceph/ceph.conf:z',
2231 for k
, v
in extra_mounts
.items():
2233 timeout
= timeout
or args
.timeout
2234 return CephContainer(
2236 entrypoint
='/usr/bin/ceph',
2238 volume_mounts
=mounts
,
2239 ).run(timeout
=timeout
)
2241 logger
.info('Waiting for mon to start...')
2244 entrypoint
='/usr/bin/ceph',
2248 mon_dir
: '/var/lib/ceph/mon/ceph-%s:z' % (mon_id
),
2249 tmp_admin_keyring
.name
: '/etc/ceph/ceph.client.admin.keyring:z',
2250 tmp_config
.name
: '/etc/ceph/ceph.conf:z',
2254 # wait for the service to become available
2255 def is_mon_available():
2257 timeout
=args
.timeout
if args
.timeout
else 30 # seconds
2258 out
, err
, ret
= call(c
.run_cmd(),
2262 is_available('mon', is_mon_available
)
2264 # assimilate and minimize config
2265 if not args
.no_minimize_config
:
2266 logger
.info('Assimilating anything we can from ceph.conf...')
2268 'config', 'assimilate-conf',
2269 '-i', '/var/lib/ceph/mon/ceph-%s/config' % mon_id
2271 mon_dir
: '/var/lib/ceph/mon/ceph-%s:z' % mon_id
2273 logger
.info('Generating new minimal ceph.conf...')
2275 'config', 'generate-minimal-conf',
2276 '-o', '/var/lib/ceph/mon/ceph-%s/config' % mon_id
2278 mon_dir
: '/var/lib/ceph/mon/ceph-%s:z' % mon_id
2280 # re-read our minimized config
2281 with
open(mon_dir
+ '/config', 'r') as f
:
2283 logger
.info('Restarting the monitor...')
2287 get_unit_name(fsid
, 'mon', mon_id
)
2291 logger
.info('Setting mon public_network...')
2292 cli(['config', 'set', 'mon', 'public_network', mon_network
])
2295 logger
.info('Creating mgr...')
2296 mgr_keyring
= '[mgr.%s]\n\tkey = %s\n' % (mgr_id
, mgr_key
)
2297 mgr_c
= get_container(fsid
, 'mgr', mgr_id
)
2298 deploy_daemon(fsid
, 'mgr', mgr_id
, mgr_c
, uid
, gid
,
2299 config
=config
, keyring
=mgr_keyring
)
2302 with
open(args
.output_keyring
, 'w') as f
:
2303 os
.fchmod(f
.fileno(), 0o600)
2304 f
.write('[client.admin]\n'
2305 '\tkey = ' + admin_key
+ '\n')
2306 logger
.info('Wrote keyring to %s' % args
.output_keyring
)
2308 with
open(args
.output_config
, 'w') as f
:
2310 logger
.info('Wrote config to %s' % args
.output_config
)
2312 # wait for the service to become available
2313 logger
.info('Waiting for mgr to start...')
2314 def is_mgr_available():
2316 timeout
=args
.timeout
if args
.timeout
else 30 # seconds
2317 out
= cli(['status', '-f', 'json-pretty'], timeout
=timeout
)
2319 return j
.get('mgrmap', {}).get('available', False)
2320 is_available('mgr', is_mgr_available
)
2322 # wait for mgr to restart (after enabling a module)
2323 def wait_for_mgr_restart():
2324 # first get latest mgrmap epoch from the mon
2325 out
= cli(['mgr', 'dump'])
2328 # wait for mgr to have it
2329 logger
.info('Waiting for the mgr to restart...')
2330 def mgr_has_latest_epoch():
2333 out
= cli(['tell', 'mgr', 'mgr_status'])
2335 return j
['mgrmap_epoch'] >= epoch
2336 except Exception as e
:
2337 logger
.debug('tell mgr mgr_status failed: %s' % e
)
2339 is_available('Mgr epoch %d' % epoch
, mgr_has_latest_epoch
)
2342 if not args
.skip_ssh
:
2343 logger
.info('Enabling cephadm module...')
2344 cli(['mgr', 'module', 'enable', 'cephadm'])
2345 wait_for_mgr_restart()
2347 logger
.info('Setting orchestrator backend to cephadm...')
2348 cli(['orch', 'set', 'backend', 'cephadm'])
2350 logger
.info('Generating ssh key...')
2351 cli(['cephadm', 'generate-key'])
2352 ssh_pub
= cli(['cephadm', 'get-pub-key'])
2354 with
open(args
.output_pub_ssh_key
, 'w') as f
:
2356 logger
.info('Wrote public SSH key to to %s' % args
.output_pub_ssh_key
)
2358 logger
.info('Adding key to root@localhost\'s authorized_keys...')
2359 if not os
.path
.exists('/root/.ssh'):
2360 os
.mkdir('/root/.ssh', 0o700)
2361 auth_keys_file
= '/root/.ssh/authorized_keys'
2363 if os
.path
.exists(auth_keys_file
):
2364 with
open(auth_keys_file
, 'r') as f
:
2365 f
.seek(0, os
.SEEK_END
)
2367 f
.seek(f
.tell()-1, os
.SEEK_SET
) # go to last char
2368 if f
.read() != '\n':
2370 with
open(auth_keys_file
, 'a') as f
:
2371 os
.fchmod(f
.fileno(), 0o600) # just in case we created it
2374 f
.write(ssh_pub
.strip() + '\n')
2376 host
= get_hostname()
2377 logger
.info('Adding host %s...' % host
)
2378 cli(['orch', 'host', 'add', host
])
2380 if not args
.orphan_initial_daemons
:
2381 for t
in ['mon', 'mgr', 'crash']:
2382 logger
.info('Deploying %s service with default placement...' % t
)
2383 cli(['orch', 'apply', t
])
2385 if not args
.skip_monitoring_stack
:
2386 logger
.info('Enabling mgr prometheus module...')
2387 cli(['mgr', 'module', 'enable', 'prometheus'])
2388 for t
in ['prometheus', 'grafana', 'node-exporter', 'alertmanager']:
2389 logger
.info('Deploying %s service with default placement...' % t
)
2390 cli(['orch', 'apply', t
])
2392 if not args
.skip_dashboard
:
2393 logger
.info('Enabling the dashboard module...')
2394 cli(['mgr', 'module', 'enable', 'dashboard'])
2395 wait_for_mgr_restart()
2397 # dashboard crt and key
2398 if args
.dashboard_key
and args
.dashboard_crt
:
2399 logger
.info('Using provided dashboard certificate...')
2401 mounts
[pathify(args
.dashboard_crt
)] = '/tmp/dashboard.crt:z'
2402 mounts
[pathify(args
.dashboard_key
)] = '/tmp/dashboard.key:z'
2403 cli(['dashboard', 'set-ssl-certificate', '-i', '/tmp/dashboard.crt'], extra_mounts
=mounts
)
2404 cli(['dashboard', 'set-ssl-certificate-key', '-i', '/tmp/dashboard.key'], extra_mounts
=mounts
)
2406 logger
.info('Generating a dashboard self-signed certificate...')
2407 cli(['dashboard', 'create-self-signed-cert'])
2409 logger
.info('Creating initial admin user...')
2410 password
= args
.initial_dashboard_password
or generate_password()
2411 cmd
= ['dashboard', 'ac-user-create', args
.initial_dashboard_user
, password
, 'administrator', '--force-password']
2412 if not args
.dashboard_password_noupdate
:
2413 cmd
.append('--pwd-update-required')
2415 logger
.info('Fetching dashboard port number...')
2416 out
= cli(['config', 'get', 'mgr', 'mgr/dashboard/ssl_server_port'])
2419 logger
.info('Ceph Dashboard is now available at:\n\n'
2420 '\t URL: https://%s:%s/\n'
2422 '\tPassword: %s\n' % (
2424 args
.initial_dashboard_user
,
2427 logger
.info('You can access the Ceph CLI with:\n\n'
2428 '\tsudo %s shell --fsid %s -c %s -k %s\n' % (
2432 args
.output_keyring
))
2433 logger
.info('Please consider enabling telemetry to help improve Ceph:\n\n'
2434 '\tceph telemetry on\n\n'
2435 'For more information see:\n\n'
2436 '\thttps://docs.ceph.com/docs/master/mgr/telemetry/\n')
2437 logger
.info('Bootstrap complete.')
2440 ##################################
def extract_uid_gid_monitoring(daemon_type):
    # type: (str) -> Tuple[int, int]
    """Return the (uid, gid) a monitoring daemon should run as.

    node-exporter runs as the fixed 'nobody' uid/gid; the other components
    are probed from their container image. Raises Error for unknown types.
    """
    if daemon_type == 'node-exporter':
        return 65534, 65534
    probe_paths = {
        'prometheus': '/etc/prometheus',
        'grafana': '/var/lib/grafana',
        'alertmanager': '/etc/alertmanager',
    }
    if daemon_type not in probe_paths:
        raise Error("{} not implemented yet".format(daemon_type))
    return extract_uid_gid(file_path=probe_paths[daemon_type])
def command_deploy():
    # type: () -> None
    """Deploy a single daemon (ceph, monitoring, or nfs) on this host.

    Dispatches on the daemon type parsed from ``args.name`` (``<type>.<id>``)
    and hands off to deploy_daemon() with the appropriate config/keyring,
    uid/gid, and container.
    """
    (daemon_type, daemon_id) = args.name.split('.', 1)

    l = FileLock(args.fsid)
    l.acquire()

    if daemon_type not in get_supported_daemons():
        raise Error('daemon type %s not recognized' % daemon_type)

    logger.info('Deploying daemon %s.%s ...' % (daemon_type, daemon_id))

    if daemon_type in Ceph.daemons:
        # core ceph daemon: needs config + keyring and the ceph uid/gid
        (config, keyring) = get_config_and_keyring()
        (uid, gid) = extract_uid_gid()
        make_var_run(args.fsid, uid, gid)
        c = get_container(args.fsid, daemon_type, daemon_id,
                          ptrace=args.allow_ptrace)
        deploy_daemon(args.fsid, daemon_type, daemon_id, c, uid, gid,
                      config=config, keyring=keyring,
                      osd_fsid=args.osd_fsid,
                      reconfig=args.reconfig)

    elif daemon_type in Monitoring.components:
        # monitoring daemon - prometheus, grafana, alertmanager, node-exporter
        monitoring_args = []  # type: List[str]

        # on a fresh deploy (not reconfig), refuse to bind ports already taken
        if not args.reconfig:
            daemon_ports = Monitoring.port_map[daemon_type]  # type: List[int]
            if any([port_in_use(port) for port in daemon_ports]):
                raise Error("TCP Port(s) '{}' required for {} is already in use".format(",".join(map(str, daemon_ports)), daemon_type))

        # make sure provided config-json is sufficient
        config = get_parm(args.config_json)  # type: ignore
        required_files = Monitoring.components[daemon_type].get('config-json-files', list())
        required_args = Monitoring.components[daemon_type].get('config-json-args', list())
        if required_files:
            if not config or not all(c in config.get('files', {}).keys() for c in required_files):  # type: ignore
                raise Error("{} deployment requires config-json which must "
                            "contain file content for {}".format(daemon_type.capitalize(), ', '.join(required_files)))
        if required_args:
            if not config or not all(c in config.keys() for c in required_args):  # type: ignore
                raise Error("{} deployment requires config-json which must "
                            "contain arg for {}".format(daemon_type.capitalize(), ', '.join(required_args)))

        uid, gid = extract_uid_gid_monitoring(daemon_type)
        c = get_container(args.fsid, daemon_type, daemon_id)
        deploy_daemon(args.fsid, daemon_type, daemon_id, c, uid, gid,
                      reconfig=args.reconfig)

    elif daemon_type == NFSGanesha.daemon_type:
        NFSGanesha.port_in_use()
        (config, keyring) = get_config_and_keyring()
        # TODO: extract ganesha uid/gid (997, 994) ?
        (uid, gid) = extract_uid_gid()
        c = get_container(args.fsid, daemon_type, daemon_id)
        deploy_daemon(args.fsid, daemon_type, daemon_id, c, uid, gid,
                      config=config, keyring=keyring,
                      reconfig=args.reconfig)
    else:
        raise Error("{} not implemented in command_deploy function".format(daemon_type))
2523 ##################################
def command_run():
    # type: () -> int
    """Run a daemon's container in the foreground; return its exit code."""
    (daemon_type, daemon_id) = args.name.split('.', 1)
    container = get_container(args.fsid, daemon_type, daemon_id)
    return call_timeout(container.run_cmd(), args.timeout)
2533 ##################################
def command_shell():
    # type: () -> int
    """Open an interactive shell (or run a command) inside a ceph container.

    Mounts the cluster's config/keyring (falling back to /etc/ceph defaults
    when present) and, for interactive use, a persistent /root home under
    the cluster's data dir.
    """
    if args.fsid:
        make_log_dir(args.fsid)

    # resolve which daemon's mount set to borrow
    if args.name:
        if '.' in args.name:
            (daemon_type, daemon_id) = args.name.split('.', 1)
        else:
            daemon_type = args.name
            daemon_id = None
    else:
        daemon_type = 'osd'  # get the most mounts
        daemon_id = None

    if daemon_id and not args.fsid:
        raise Error('must pass --fsid to specify cluster')

    # use /etc/ceph files by default, if present. we do this instead of
    # making these defaults in the arg parser because we don't want an error
    # if they don't exist.
    if not args.config and os.path.exists(SHELL_DEFAULT_CONF):
        args.config = SHELL_DEFAULT_CONF
    if not args.keyring and os.path.exists(SHELL_DEFAULT_KEYRING):
        args.keyring = SHELL_DEFAULT_KEYRING

    container_args = []  # type: List[str]
    mounts = get_container_mounts(args.fsid, daemon_type, daemon_id,
                                  no_config=True if args.config else False)
    if args.config:
        mounts[pathify(args.config)] = '/etc/ceph/ceph.conf:z'
    if args.keyring:
        mounts[pathify(args.keyring)] = '/etc/ceph/ceph.keyring:z'

    if args.command:
        command = args.command
    else:
        # interactive shell with a friendly prompt
        command = ['bash']
        container_args += [
            '-it',
            '-e', 'LANG=C',
            '-e', "PS1=%s" % CUSTOM_PS1,
        ]
        if args.fsid:
            home = os.path.join(args.data_dir, args.fsid, 'home')
            if not os.path.exists(home):
                logger.debug('Creating root home at %s' % home)
                makedirs(home, 0, 0, 0o660)
                # seed bash dotfiles from /etc/skel so the shell is usable
                if os.path.exists('/etc/skel'):
                    for f in os.listdir('/etc/skel'):
                        if f.startswith('.bash'):
                            shutil.copyfile(os.path.join('/etc/skel', f),
                                            os.path.join(home, f))
            mounts[home] = '/root'

    c = CephContainer(
        image=args.image,
        entrypoint='doesnotmatter',
        args=[],
        container_args=container_args,
        volume_mounts=mounts,
        privileged=True)
    command = c.shell_cmd(command)

    return call_timeout(command, args.timeout)
2602 ##################################
def command_enter():
    # type: () -> int
    """Exec a command (default: an interactive shell) inside a running
    daemon container identified by ``args.name``."""
    if not args.fsid:
        raise Error('must pass --fsid to specify cluster')
    (daemon_type, daemon_id) = args.name.split('.', 1)
    container_args = []  # type: List[str]
    if args.command:
        command = args.command
    else:
        command = ['sh']
        container_args += [
            '-it',
            '-e', 'LANG=C',
            '-e', "PS1=%s" % CUSTOM_PS1,
        ]
    c = get_container(args.fsid, daemon_type, daemon_id,
                      container_args=container_args)
    return call_timeout(c.exec_cmd(command), args.timeout)
2625 ##################################
def command_ceph_volume():
    # type: () -> None
    """Run ceph-volume inside a privileged container with OSD mounts.

    When --config-json is given, its config/keyring are written to temp
    files and bind-mounted at the paths ceph-volume expects.
    """
    if args.fsid:
        make_log_dir(args.fsid)

    (uid, gid) = (0, 0)  # ceph-volume runs as root
    mounts = get_container_mounts(args.fsid, 'osd', None)

    tmp_config = None
    tmp_keyring = None

    if args.config_json:
        # note: this will always pull from args.config_json (we
        # require it) and never args.config or args.keyring.
        (config, keyring) = get_config_and_keyring()

        # tmp keyring file; kept alive by the local reference until we return
        tmp_keyring = write_tmp(keyring, uid, gid)

        # tmp config file
        tmp_config = write_tmp(config, uid, gid)

        # config-json options
        mounts[tmp_config.name] = '/etc/ceph/ceph.conf:z'
        mounts[tmp_keyring.name] = '/var/lib/ceph/bootstrap-osd/ceph.keyring:z'

    c = CephContainer(
        image=args.image,
        entrypoint='/usr/sbin/ceph-volume',
        args=args.command,
        privileged=True,
        volume_mounts=mounts,
    )
    out, err, code = call_throws(c.run_cmd(), verbose=True)
    if not code:
        print(out)
2665 ##################################
def command_unit():
    # type: () -> None
    """Pass a systemctl verb (start/stop/...) through to a daemon's unit."""
    if not args.fsid:
        raise Error('must pass --fsid to specify cluster')
    (daemon_type, daemon_id) = args.name.split('.', 1)
    unit_name = get_unit_name(args.fsid, daemon_type, daemon_id)
    call_throws(['systemctl', args.command, unit_name])
2679 ##################################
def command_logs():
    # type: () -> None
    """Show journalctl output for a daemon's systemd unit, passing any
    extra arguments (e.g. -f, -n) straight through to journalctl."""
    if not args.fsid:
        raise Error('must pass --fsid to specify cluster')

    (daemon_type, daemon_id) = args.name.split('.', 1)
    unit_name = get_unit_name(args.fsid, daemon_type, daemon_id)

    cmd = [find_program('journalctl')]
    cmd.extend(['-u', unit_name])
    if args.command:
        cmd.extend(args.command)

    # call this directly, without our wrapper, so that we get an unmolested
    # stdout with logger prefixing.
    logger.debug("Running command: %s" % ' '.join(cmd))
    subprocess.call(cmd)  # type: ignore
2700 ##################################
def list_networks():
    # type: () -> Dict[str,List[str]]
    """Return {network/cidr: [local addresses]} parsed from `ip route ls`.

    18.04's iproute2 (4.15.0-2ubun) doesn't support the -j (JSON) flag,
    so the plain-text output is parsed with a regex instead.
    """
    out, _, _ = call_throws([find_executable('ip'), 'route', 'ls'])
    return _parse_ip_route(out)
2714 def _parse_ip_route(out
):
2715 r
= {} # type: Dict[str,List[str]]
2716 p
= re
.compile(r
'^(\S+) (.*)scope link (.*)src (\S+)')
2717 for line
in out
.splitlines():
def command_list_networks():
    # type: () -> None
    """Print the host's directly attached networks as indented JSON."""
    print(json.dumps(list_networks(), indent=4))
2733 ##################################
def command_ls():
    # type: () -> None
    """Print all daemons found on this host as indented JSON."""
    daemons = list_daemons(detail=not args.no_detail,
                           legacy_dir=args.legacy_dir)
    print(json.dumps(daemons, indent=4))
def list_daemons(detail=True, legacy_dir=None):
    # type: (bool, Optional[str]) -> List[Dict[str, str]]
    """Scan the data dir and report every daemon found on this host.

    Recognizes both 'legacy' daemons (/var/lib/ceph/<type>/<cluster>-<id>)
    and cephadm-style daemons (/var/lib/ceph/<fsid>/<name>). With
    ``detail=True``, also queries systemd state and the container runtime
    for image/version information.

    :param detail: include unit state, container and version info
    :param legacy_dir: optional prefix for testing against a fake root
    :return: list of dicts, one per daemon

    Fixes vs. original: use the module-level ``logger`` (not the bare
    ``logging`` module) for the unknown-daemon warning, and avoid rebinding
    the ``os.listdir`` loop variable as the result dict.
    """
    host_version = None
    ls = []

    data_dir = args.data_dir
    if legacy_dir is not None:
        data_dir = os.path.abspath(legacy_dir + data_dir)

    # keep track of ceph versions we see, by container image id
    seen_versions = {}  # type: Dict[str, Optional[str]]

    # /var/lib/ceph
    if os.path.exists(data_dir):
        for i in os.listdir(data_dir):
            if i in ['mon', 'osd', 'mds', 'mgr']:
                daemon_type = i
                for j in os.listdir(os.path.join(data_dir, i)):
                    if '-' not in j:
                        continue
                    (cluster, daemon_id) = j.split('-', 1)
                    fsid = get_legacy_daemon_fsid(
                        cluster, daemon_type, daemon_id,
                        legacy_dir=legacy_dir)
                    item = {
                        'style': 'legacy',
                        'name': '%s.%s' % (daemon_type, daemon_id),
                        'fsid': fsid if fsid is not None else 'unknown',
                    }
                    if detail:
                        (item['enabled'], item['state'], _) = check_unit(
                            'ceph-%s@%s' % (daemon_type, daemon_id))
                        if not host_version:
                            out, err, code = call(['ceph', '-v'])
                            if not code and out.startswith('ceph version '):
                                host_version = out.split(' ')[2]
                        item['host_version'] = host_version
                    ls.append(item)
            elif is_fsid(i):
                fsid = str(i)  # convince mypy that fsid is a str here
                for j in os.listdir(os.path.join(data_dir, i)):
                    if '.' not in j:
                        continue
                    name = j
                    (daemon_type, daemon_id) = j.split('.', 1)
                    unit_name = get_unit_name(fsid,
                                              daemon_type,
                                              daemon_id)
                    item = {
                        'style': 'cephadm:v1',
                        'name': name,
                        'fsid': fsid,
                    }
                    if detail:
                        # get container id
                        (item['enabled'], item['state'], _) = check_unit(unit_name)
                        container_id = None
                        image_name = None
                        image_id = None
                        version = None
                        start_stamp = None

                        # podman < 1.6.2 used a different field name for the image id
                        if 'podman' in container_path and get_podman_version() < (1, 6, 2):
                            image_field = '.ImageID'
                        else:
                            image_field = '.Image'

                        out, err, code = call(
                            [
                                container_path, 'inspect',
                                '--format', '{{.Id}},{{.Config.Image}},{{%s}},{{.Created}},{{index .Config.Labels "io.ceph.version"}}' % image_field,
                                'ceph-%s-%s' % (fsid, j)
                            ],
                            verbose_on_failure=False)
                        if not code:
                            (container_id, image_name, image_id, start,
                             version) = out.strip().split(',')
                            image_id = normalize_container_id(image_id)
                            daemon_type = name.split('.', 1)[0]
                            start_stamp = try_convert_datetime(start)
                            # the image label may be unset or malformed; fall
                            # back to a version we already probed for this image
                            if not version or '.' not in version:
                                version = seen_versions.get(image_id, None)
                            if daemon_type == NFSGanesha.daemon_type:
                                version = NFSGanesha.get_version(container_id)
                            if not version:
                                if daemon_type in Ceph.daemons:
                                    out, err, code = call(
                                        [container_path, 'exec', container_id,
                                         'ceph', '-v'])
                                    if not code and \
                                       out.startswith('ceph version '):
                                        version = out.split(' ')[2]
                                        seen_versions[image_id] = version
                                elif daemon_type == 'grafana':
                                    out, err, code = call(
                                        [container_path, 'exec', container_id,
                                         'grafana-server', '-v'])
                                    if not code and \
                                       out.startswith('Version '):
                                        version = out.split(' ')[1]
                                        seen_versions[image_id] = version
                                elif daemon_type in ['prometheus',
                                                     'alertmanager',
                                                     'node-exporter']:
                                    cmd = daemon_type.replace('-', '_')
                                    out, err, code = call(
                                        [container_path, 'exec', container_id,
                                         cmd, '--version'])
                                    # these tools print their version on stderr
                                    if not code and \
                                       err.startswith('%s, version ' % cmd):
                                        version = err.split(' ')[2]
                                        seen_versions[image_id] = version
                                else:
                                    logger.warning('version for unknown daemon type %s' % daemon_type)
                        else:
                            # container not present/running: fall back to the
                            # image name recorded at deploy time
                            vfile = os.path.join(data_dir, fsid, j, 'unit.image')  # type: ignore
                            try:
                                with open(vfile, 'r') as f:
                                    image_name = f.read().strip() or None
                            except IOError:
                                pass
                        item['container_id'] = container_id
                        item['container_image_name'] = image_name
                        item['container_image_id'] = image_id
                        item['version'] = version
                        item['started'] = start_stamp
                        item['created'] = get_file_timestamp(
                            os.path.join(data_dir, fsid, j, 'unit.created')
                        )
                        item['deployed'] = get_file_timestamp(
                            os.path.join(data_dir, fsid, j, 'unit.image'))
                        item['configured'] = get_file_timestamp(
                            os.path.join(data_dir, fsid, j, 'unit.configured'))
                    ls.append(item)

    return ls
2886 ##################################
def command_adopt():
    # type: () -> None
    """Adopt a legacy (pre-cephadm) daemon into cephadm management.

    Detects the legacy cluster fsid, then dispatches to the per-daemon-type
    adoption routine. Only 'legacy' style adoption is implemented.

    Fix vs. original: removed stray trailing semicolon on the
    command_adopt_ceph call (un-Pythonic).
    """
    if not args.skip_pull:
        logger.info('Pulling latest %s container...' % args.image)
        call_throws([container_path, 'pull', args.image])

    (daemon_type, daemon_id) = args.name.split('.', 1)

    # legacy check
    if args.style != 'legacy':
        raise Error('adoption of style %s not implemented' % args.style)

    # lock
    fsid = get_legacy_daemon_fsid(args.cluster,
                                  daemon_type,
                                  daemon_id,
                                  legacy_dir=args.legacy_dir)
    if not fsid:
        raise Error('could not detect legacy fsid; set fsid in ceph.conf')
    l = FileLock(fsid)
    l.acquire()

    # call correct adoption
    if daemon_type in Ceph.daemons:
        command_adopt_ceph(daemon_type, daemon_id, fsid)
    elif daemon_type == 'prometheus':
        command_adopt_prometheus(daemon_id, fsid)
    elif daemon_type == 'grafana':
        command_adopt_grafana(daemon_id, fsid)
    elif daemon_type == 'node-exporter':
        raise Error('adoption of node-exporter not implemented')
    elif daemon_type == 'alertmanager':
        raise Error('adoption of alert-manager not implemented')
    else:
        raise Error('daemon type %s not recognized' % daemon_type)
def command_adopt_ceph(daemon_type, daemon_id, fsid):
    # type: (str, str, str) -> None
    """Adopt one legacy ceph daemon: stop its old unit, move its data and
    logs under the cephadm layout, fix ownership, and deploy new units."""

    (uid, gid) = extract_uid_gid()

    data_dir_src = ('/var/lib/ceph/%s/%s-%s' %
                    (daemon_type, args.cluster, daemon_id))
    data_dir_src = os.path.abspath(args.legacy_dir + data_dir_src)

    osd_fsid = None
    if daemon_type == 'osd':
        path = os.path.join(data_dir_src, 'fsid')
        try:
            with open(path, 'r') as f:
                osd_fsid = f.read().strip()
        except IOError:
            raise Error('unable to read OSD fsid from %s' % path)
        os_type = None
        if os.path.exists(os.path.join(data_dir_src, 'type')):
            with open(os.path.join(data_dir_src, 'type')) as f:
                os_type = f.read().strip()
        else:
            raise Error('"type" file missing for OSD data dir')
        logger.info('objectstore_type is %s' % os_type)
        if os_type == 'filestore':
            raise Error('FileStore is not supported by cephadm')

    # NOTE: implicit assumption here that the units correspond to the
    # cluster we are adopting based on the /etc/{defaults,sysconfig}/ceph
    # CLUSTER field.
    unit_name = 'ceph-%s@%s' % (daemon_type, daemon_id)
    (enabled, state, _) = check_unit(unit_name)
    if state == 'running':
        logger.info('Stopping old systemd unit %s...' % unit_name)
        call_throws(['systemctl', 'stop', unit_name])
    if enabled:
        logger.info('Disabling old systemd unit %s...' % unit_name)
        call_throws(['systemctl', 'disable', unit_name])

    # data
    logger.info('Moving data...')
    data_dir_dst = make_data_dir(fsid, daemon_type, daemon_id,
                                 uid=uid, gid=gid)
    move_files(glob(os.path.join(data_dir_src, '*')),
               data_dir_dst,
               uid=uid, gid=gid)
    logger.debug('Remove dir \'%s\'' % (data_dir_src))
    if os.path.ismount(data_dir_src):
        call_throws(['umount', data_dir_src])
    os.rmdir(data_dir_src)

    logger.info('Chowning content...')
    call_throws(['chown', '-c', '-R', '%d.%d' % (uid, gid), data_dir_dst])

    if daemon_type == 'mon':
        # rename *.ldb -> *.sst, in case they are coming from ubuntu
        store = os.path.join(data_dir_dst, 'store.db')
        num_renamed = 0
        if os.path.exists(store):
            for oldf in os.listdir(store):
                if oldf.endswith('.ldb'):
                    newf = oldf.replace('.ldb', '.sst')
                    oldp = os.path.join(store, oldf)
                    newp = os.path.join(store, newf)
                    logger.debug('Renaming %s -> %s' % (oldp, newp))
                    os.rename(oldp, newp)
                    num_renamed += 1
        if num_renamed:
            logger.info('Renamed %d leveldb *.ldb files to *.sst',
                        num_renamed)

    if daemon_type == 'osd':
        for n in ['block', 'block.db', 'block.wal']:
            p = os.path.join(data_dir_dst, n)
            if os.path.exists(p):
                logger.info('Chowning %s...' % p)
                os.chown(p, uid, gid)
        # disable the ceph-volume 'simple' mode files on the host
        simple_fn = os.path.join('/etc/ceph/osd',
                                 '%s-%s.json' % (daemon_id, osd_fsid))
        if os.path.exists(simple_fn):
            new_fn = simple_fn + '.adopted-by-cephadm'
            logger.info('Renaming %s -> %s', simple_fn, new_fn)
            os.rename(simple_fn, new_fn)
            logger.info('Disabling host unit ceph-volume@ simple unit...')
            call_throws(['systemctl', 'disable',
                         'ceph-volume@simple-%s-%s.service' % (
                             daemon_id, osd_fsid)])
        else:
            # assume this is an 'lvm' c-v for now, but don't error
            # out if it's not.
            logger.info('Disabling host unit ceph-volume@ lvm unit...')
            call(['systemctl', 'disable',
                  'ceph-volume@lvm-%s-%s.service' % (daemon_id, osd_fsid)])

    # config
    config_src = '/etc/ceph/%s.conf' % (args.cluster)
    config_src = os.path.abspath(args.legacy_dir + config_src)
    config_dst = os.path.join(data_dir_dst, 'config')
    copy_files([config_src], config_dst, uid=uid, gid=gid)

    # logs
    logger.info('Moving logs...')
    log_dir_src = ('/var/log/ceph/%s-%s.%s.log*' %
                   (args.cluster, daemon_type, daemon_id))
    log_dir_src = os.path.abspath(args.legacy_dir + log_dir_src)
    log_dir_dst = make_log_dir(fsid, uid=uid, gid=gid)
    move_files(glob(log_dir_src),
               log_dir_dst,
               uid=uid, gid=gid)

    logger.info('Creating new units...')
    make_var_run(fsid, uid, gid)
    c = get_container(fsid, daemon_type, daemon_id)
    deploy_daemon_units(fsid, uid, gid, daemon_type, daemon_id, c,
                        enable=True,  # unconditionally enable the new unit
                        start=(state == 'running'),
                        osd_fsid=osd_fsid)
    update_firewalld(daemon_type)
def command_adopt_prometheus(daemon_id, fsid):
    # type: (str, str) -> None
    """Adopt a legacy prometheus instance: stop the old unit, copy its
    config and metric data into the cephadm layout, and redeploy."""
    daemon_type = 'prometheus'
    (uid, gid) = extract_uid_gid_monitoring(daemon_type)

    _stop_and_disable('prometheus')

    data_dir_dst = make_data_dir(fsid, daemon_type, daemon_id,
                                 uid=uid, gid=gid)

    # config
    config_src = os.path.abspath(args.legacy_dir + '/etc/prometheus/prometheus.yml')
    config_dst = os.path.join(data_dir_dst, 'etc/prometheus')
    copy_files([config_src], config_dst, uid=uid, gid=gid)

    # data
    data_src = os.path.abspath(args.legacy_dir + '/var/lib/prometheus/metrics/')
    data_dst = os.path.join(data_dir_dst, 'data')
    copy_tree([data_src], data_dst, uid=uid, gid=gid)

    make_var_run(fsid, uid, gid)
    c = get_container(fsid, daemon_type, daemon_id)
    deploy_daemon(fsid, daemon_type, daemon_id, c, uid, gid)
    update_firewalld(daemon_type)
def command_adopt_grafana(daemon_id, fsid):
    # type: (str, str) -> None
    """Adopt a legacy grafana instance: migrate its ini config,
    provisioning dir, TLS cert/key, and data dir into the cephadm layout,
    then redeploy under cephadm units."""
    daemon_type = 'grafana'
    (uid, gid) = extract_uid_gid_monitoring(daemon_type)

    _stop_and_disable('grafana-server')

    data_dir_dst = make_data_dir(fsid, daemon_type, daemon_id,
                                 uid=uid, gid=gid)

    # config
    config_src = os.path.abspath(args.legacy_dir + '/etc/grafana/grafana.ini')
    config_dst = os.path.join(data_dir_dst, 'etc/grafana')
    makedirs(config_dst, uid, gid, 0o755)
    copy_files([config_src], config_dst, uid=uid, gid=gid)

    prov_src = os.path.abspath(args.legacy_dir + '/etc/grafana/provisioning/')
    prov_dst = os.path.join(data_dir_dst, 'etc/grafana')
    copy_tree([prov_src], prov_dst, uid=uid, gid=gid)

    # cert
    cert = '/etc/grafana/grafana.crt'
    key = '/etc/grafana/grafana.key'
    if os.path.exists(cert) and os.path.exists(key):
        cert_src = os.path.abspath(args.legacy_dir + '/etc/grafana/grafana.crt')
        makedirs(os.path.join(data_dir_dst, 'etc/grafana/certs'), uid, gid, 0o755)
        cert_dst = os.path.join(data_dir_dst, 'etc/grafana/certs/cert_file')
        copy_files([cert_src], cert_dst, uid=uid, gid=gid)

        key_src = os.path.abspath(args.legacy_dir + '/etc/grafana/grafana.key')
        key_dst = os.path.join(data_dir_dst, 'etc/grafana/certs/cert_key')
        copy_files([key_src], key_dst, uid=uid, gid=gid)

        # point the migrated ini at the new cert/key locations
        _adjust_grafana_ini(os.path.join(config_dst, 'grafana.ini'))
    else:
        logger.debug("Skipping ssl, missing cert {} or key {}".format(cert, key))

    # data - possible custom dashboards/plugins
    data_src = os.path.abspath(args.legacy_dir + '/var/lib/grafana/')
    data_dst = os.path.join(data_dir_dst, 'data')
    copy_tree([data_src], data_dst, uid=uid, gid=gid)

    make_var_run(fsid, uid, gid)
    c = get_container(fsid, daemon_type, daemon_id)
    deploy_daemon(fsid, daemon_type, daemon_id, c, uid, gid)
    update_firewalld(daemon_type)
3129 def _adjust_grafana_ini(filename
):
3130 # type: (str) -> None
3132 # Update cert_file, cert_key pathnames in server section
3133 # ConfigParser does not preserve comments
3135 with
open(filename
, "r") as grafana_ini
:
3136 lines
= grafana_ini
.readlines()
3137 with
open("{}.new".format(filename
), "w") as grafana_ini
:
3138 server_section
=False
3140 if line
.startswith('['):
3141 server_section
=False
3142 if line
.startswith('[server]'):
3145 line
= re
.sub(r
'^cert_file.*',
3146 'cert_file = /etc/grafana/certs/cert_file', line
)
3147 line
= re
.sub(r
'^cert_key.*',
3148 'cert_key = /etc/grafana/certs/cert_key', line
)
3149 grafana_ini
.write(line
)
3150 os
.rename("{}.new".format(filename
), filename
)
3151 except OSError as err
:
3152 raise Error("Cannot update {}: {}".format(filename
, err
))
def _stop_and_disable(unit_name):
    # type: (str) -> None
    """Stop (if running) and disable (if enabled) a host systemd unit."""
    (enabled, state, _) = check_unit(unit_name)
    if state == 'running':
        logger.info('Stopping old systemd unit %s...' % unit_name)
        call_throws(['systemctl', 'stop', unit_name])
    if enabled:
        logger.info('Disabling old systemd unit %s...' % unit_name)
        call_throws(['systemctl', 'disable', unit_name])
3167 ##################################
def command_rm_daemon():
    # type: () -> None
    """Remove a single daemon from this host.

    Stops and disables its unit, then either archives its data dir under
    <data_dir>/<fsid>/removed (for mon/osd/prometheus, unless
    --force-delete-data) or deletes it outright.
    """
    l = FileLock(args.fsid)
    l.acquire()

    (daemon_type, daemon_id) = args.name.split('.', 1)
    if daemon_type in ['mon', 'osd'] and not args.force:
        raise Error('must pass --force to proceed: '
                    'this command may destroy precious data!')

    unit_name = get_unit_name(args.fsid, daemon_type, daemon_id)
    # best-effort teardown of the unit; don't spam on failure
    for verb in ['stop', 'reset-failed', 'disable']:
        call(['systemctl', verb, unit_name],
             verbose_on_failure=False)

    data_dir = get_data_dir(args.fsid, daemon_type, daemon_id)
    if daemon_type in ['mon', 'osd', 'prometheus'] and \
       not args.force_delete_data:
        # rename it out of the way -- do not delete
        backup_dir = os.path.join(args.data_dir, args.fsid, 'removed')
        if not os.path.exists(backup_dir):
            makedirs(backup_dir, 0, 0, DATA_DIR_MODE)
        dirname = '%s.%s_%s' % (daemon_type, daemon_id,
                                datetime.datetime.utcnow().strftime(DATEFMT))
        os.rename(data_dir,
                  os.path.join(backup_dir, dirname))
    else:
        call_throws(['rm', '-rf', data_dir])
3200 ##################################
def command_rm_cluster():
    # type: () -> None
    """Remove every trace of a cluster fsid from this host: daemon units,
    cluster target/slice, data, logs, and the logrotate config."""
    if not args.force:
        raise Error('must pass --force to proceed: '
                    'this command may destroy precious data!')

    l = FileLock(args.fsid)
    l.acquire()

    # stop + disable individual daemon units
    for d in list_daemons(detail=False):
        if d['fsid'] != args.fsid:
            continue
        if d['style'] != 'cephadm:v1':
            continue
        unit_name = get_unit_name(args.fsid, d['name'])
        for verb in ['stop', 'reset-failed', 'disable']:
            call(['systemctl', verb, unit_name],
                 verbose_on_failure=False)

    # cluster units
    for unit_name in ['ceph-%s.target' % args.fsid]:
        for verb in ['stop', 'reset-failed', 'disable']:
            call(['systemctl', verb, unit_name],
                 verbose_on_failure=False)

    slice_name = 'system-%s.slice' % (('ceph-%s' % args.fsid).replace('-',
                                                                      '\\x2d'))
    call(['systemctl', 'stop', slice_name],
         verbose_on_failure=False)

    # rm units
    call_throws(['rm', '-f', args.unit_dir +
                 '/ceph-%s@.service' % args.fsid])
    call_throws(['rm', '-f', args.unit_dir +
                 '/ceph-%s.target' % args.fsid])
    call_throws(['rm', '-rf',
                 args.unit_dir + '/ceph-%s.target.wants' % args.fsid])
    # rm data
    call_throws(['rm', '-rf', args.data_dir + '/' + args.fsid])
    # rm logs
    call_throws(['rm', '-rf', args.log_dir + '/' + args.fsid])
    call_throws(['rm', '-rf', args.log_dir +
                 '/*.wants/ceph-%s@*' % args.fsid])
    # rm logrotate config
    call_throws(['rm', '-f', args.logrotate_dir + '/ceph-%s' % args.fsid])
3256 ##################################
def check_time_sync(enabler=None):
    # type: (Optional[Packager]) -> bool
    """Check whether any known time-sync service unit is active.

    :param enabler: optional Packager; when given, check_units may use it
        to enable a time-sync service that is installed but not running.
    :return: True if a time sync service is running, else False.

    Fix vs. original: the ``enabler`` argument was accepted but discarded —
    the call passed ``enabler=None`` unconditionally, so the
    "install chrony then enable it" path in command_prepare_host could
    never actually enable the service. Pass it through.
    """
    units = [
        'chrony.service',  # 18.04 (at least)
        'chronyd.service',  # el / opensuse
        'systemd-timesyncd.service',
        'ntpd.service',  # el7 (at least)
        'ntp.service',  # 18.04 (at least)
    ]
    if not check_units(units, enabler=enabler):
        logger.warning('No time sync service is running; checked for %s' % units)
        return False
    return True
def command_check_host():
    # type: () -> None
    """Verify this host can run cephadm: container runtime, systemctl,
    lvcreate, an active time-sync service, and (optionally) the expected
    hostname. Raises Error on the first failed check."""
    # caller already checked for docker/podman
    logger.info('podman|docker (%s) is present' % container_path)

    for command in ['systemctl', 'lvcreate']:
        try:
            find_program(command)
            logger.info('%s is present' % command)
        except ValueError:
            raise Error('%s binary does not appear to be installed' % command)

    # check for configured+running chronyd or ntp
    if not check_time_sync():
        raise Error('No time synchronization is active')

    if 'expect_hostname' in args and args.expect_hostname:
        if get_hostname() != args.expect_hostname:
            raise Error('hostname "%s" does not match expected hostname "%s"' % (
                get_hostname(), args.expect_hostname))
        logger.info('Hostname "%s" matches what is expected.',
                    args.expect_hostname)

    logger.info('Host looks OK')
3299 ##################################
def command_prepare_host():
    # type: () -> None
    """Prepare this host for cephadm: install a container runtime, lvm2,
    and a time-sync service as needed, optionally set the hostname, then
    re-run the full host check."""
    logger.info('Verifying podman|docker is present...')
    pkg = None
    if not container_path:
        if not pkg:
            pkg = create_packager()
        pkg.install_podman()

    logger.info('Verifying lvm2 is present...')
    if not find_executable('lvcreate'):
        if not pkg:
            pkg = create_packager()
        pkg.install(['lvm2'])

    logger.info('Verifying time synchronization is in place...')
    if not check_time_sync():
        if not pkg:
            pkg = create_packager()
        pkg.install(['chrony'])
        # check again, and this time try to enable
        # the service
        check_time_sync(enabler=pkg)

    if 'expect_hostname' in args and args.expect_hostname and args.expect_hostname != get_hostname():
        logger.warning('Adjusting hostname from %s -> %s...' % (get_hostname(), args.expect_hostname))
        call_throws(['hostname', args.expect_hostname])
        with open('/etc/hostname', 'w') as f:
            f.write(args.expect_hostname + '\n')

    logger.info('Repeating the final host check...')
    command_check_host()
3334 ##################################
class CustomValidation(argparse.Action):
    """argparse action that validates the --name argument
    (<daemon_type>.<daemon_id>) before storing it."""

    def _check_name(self, values):
        # must split into exactly type + id
        try:
            (daemon_type, daemon_id) = values.split('.', 1)
        except ValueError:
            raise argparse.ArgumentError(self,
                "must be of the format <type>.<id>. For example, osd.1 or prometheus.myhost.com")

        daemons = get_supported_daemons()
        if daemon_type not in daemons:
            raise argparse.ArgumentError(self,
                "name must declare the type of daemon e.g. "
                "{}".format(', '.join(daemons)))

    def __call__(self, parser, namespace, values, option_string=None):
        if self.dest == "name":
            self._check_name(values)
        setattr(namespace, self.dest, values)
3356 ##################################
def get_distro():
    # type: () -> Tuple[Optional[str], Optional[str], Optional[str]]
    """Parse /etc/os-release and return (id, version_id, version_codename),
    all lower-cased, with surrounding double quotes stripped; any field
    missing from the file is returned as None."""
    distro = None
    distro_version = None
    distro_codename = None
    with open('/etc/os-release', 'r') as f:
        for line in f.readlines():
            line = line.strip()
            if '=' not in line or line.startswith('#'):
                continue
            (var, val) = line.split('=', 1)
            if val[0] == '"' and val[-1] == '"':
                val = val[1:-1]
            if var == 'ID':
                distro = val.lower()
            elif var == 'VERSION_ID':
                distro_version = val.lower()
            elif var == 'VERSION_CODENAME':
                distro_codename = val.lower()
    return distro, distro_version, distro_codename
class Packager(object):
    """Base class for distro package/repo management (Apt, YumDnf, ...).

    Exactly one of (stable, version, branch[, commit]) may be supplied, or
    none at all; the constructor asserts that invariant.
    """

    def __init__(self, stable=None, version=None, branch=None, commit=None):
        assert \
            (stable and not version and not branch and not commit) or \
            (not stable and version and not branch and not commit) or \
            (not stable and not version and branch) or \
            (not stable and not version and not branch and not commit)
        self.stable = stable
        self.version = version
        self.branch = branch
        self.commit = commit

    def add_repo(self):
        raise NotImplementedError

    def rm_repo(self):
        raise NotImplementedError

    def query_shaman(self, distro, distro_version, branch, commit):
        """Resolve a dev-build repo file via shaman, then fetch it from
        chacra; returns the repo file content as a string."""
        logging.info('Fetching repo metadata from shaman and chacra...')
        shaman_url = 'https://shaman.ceph.com/api/repos/ceph/{branch}/{sha1}/{distro}/{distro_version}/repo/?arch={arch}'.format(
            distro=distro,
            distro_version=distro_version,
            branch=branch,
            sha1=commit or 'latest',
            arch=get_arch())
        try:
            shaman_response = urlopen(shaman_url)
        except HTTPError as err:
            logging.error('repository not found in shaman (might not be available yet)')
            raise Error('%s, failed to fetch %s' % (err, shaman_url))
        try:
            chacra_url = shaman_response.geturl()
            chacra_response = urlopen(chacra_url)
        except HTTPError as err:
            logging.error('repository not found in chacra (might not be available yet)')
            raise Error('%s, failed to fetch %s' % (err, chacra_url))
        return chacra_response.read().decode('utf-8')

    def repo_gpgkey(self):
        """Return (url, name) of the GPG key matching the requested build:
        release key for stable/version builds, autobuild key otherwise."""
        if self.stable or self.version:
            return 'https://download.ceph.com/keys/release.asc', 'release'
        else:
            return 'https://download.ceph.com/keys/autobuild.asc', 'autobuild'

    def enable_service(self, service):
        """
        Start and enable the service (typically using systemd).
        """
        call_throws(['systemctl', 'enable', '--now', service])
class Apt(Packager):
    """Debian/Ubuntu package and repo management.

    Fix vs. original: corrected the misspelled log message
    'Setting up repo for pdoman...' -> 'Setting up repo for podman...'.
    """
    DISTRO_NAMES = {
        'ubuntu': 'ubuntu',
        'debian': 'debian',
    }

    def __init__(self, stable, version, branch, commit,
                 distro, distro_version, distro_codename):
        super(Apt, self).__init__(stable=stable, version=version,
                                  branch=branch, commit=commit)
        self.distro = self.DISTRO_NAMES[distro]
        self.distro_codename = distro_codename

    def repo_path(self):
        return '/etc/apt/sources.list.d/ceph.list'

    def add_repo(self):
        """Install the ceph GPG key and write the apt repo file."""
        url, name = self.repo_gpgkey()
        logging.info('Installing repo GPG key from %s...' % url)
        try:
            response = urlopen(url)
        except HTTPError as err:
            logging.error('failed to fetch GPG repo key from %s: %s' % (
                url, err))
            raise Error('failed to fetch GPG key')
        key = response.read().decode('utf-8')
        with open('/etc/apt/trusted.gpg.d/ceph.%s.gpg' % name, 'w') as f:
            f.write(key)

        if self.version:
            content = 'deb %s/debian-%s/ %s main\n' % (
                args.repo_url, self.version, self.distro_codename)
        elif self.stable:
            content = 'deb %s/debian-%s/ %s main\n' % (
                args.repo_url, self.stable, self.distro_codename)
        else:
            # dev build: resolve the repo file through shaman/chacra
            content = self.query_shaman(self.distro, self.distro_codename, self.branch,
                                        self.commit)

        logging.info('Installing repo file at %s...' % self.repo_path())
        with open(self.repo_path(), 'w') as f:
            f.write(content)

    def rm_repo(self):
        """Remove any installed ceph GPG keys and the apt repo file."""
        for name in ['autobuild', 'release']:
            p = '/etc/apt/trusted.gpg.d/ceph.%s.gpg' % name
            if os.path.exists(p):
                logging.info('Removing repo GPG key %s...' % p)
                os.unlink(p)
        if os.path.exists(self.repo_path()):
            logging.info('Removing repo at %s...' % self.repo_path())
            os.unlink(self.repo_path())

    def install(self, ls):
        logging.info('Installing packages %s...' % ls)
        call_throws(['apt', 'install', '-y'] + ls)

    def install_podman(self):
        """Install podman (via the projectatomic PPA on ubuntu), falling
        back to docker.io if podman installation fails."""
        if self.distro == 'ubuntu':
            logging.info('Setting up repo for podman...')
            self.install(['software-properties-common'])
            call_throws(['add-apt-repository', '-y', 'ppa:projectatomic/ppa'])
            call_throws(['apt', 'update'])

        logging.info('Attempting podman install...')
        try:
            self.install(['podman'])
        except Error:
            logging.info('Podman did not work. Falling back to docker...')
            self.install(['docker.io'])
3505 class YumDnf(Packager
):
3507 'centos': ('centos', 'el'),
3508 'rhel': ('centos', 'el'),
3509 'scientific': ('centos', 'el'),
3510 'fedora': ('fedora', 'fc'),
3513 def __init__(self
, stable
, version
, branch
, commit
,
3514 distro
, distro_version
):
3515 super(YumDnf
, self
).__init
__(stable
=stable
, version
=version
,
3516 branch
=branch
, commit
=commit
)
3517 self
.major
= int(distro_version
.split('.')[0])
3518 self
.distro_normalized
= self
.DISTRO_NAMES
[distro
][0]
3519 self
.distro_code
= self
.DISTRO_NAMES
[distro
][1] + str(self
.major
)
3520 if (self
.distro_code
== 'fc' and self
.major
>= 30) or \
3521 (self
.distro_code
== 'el' and self
.major
>= 8):
def custom_repo(self, **kw):
    """Build the text of a yum/dnf ``.repo`` file from keyword values.

    Repo files need special care in that a whole line should not be
    present if there is no value for it: a plain ``format()`` template
    would leave a dangling ``key=`` for a ``None`` value. A line is
    therefore emitted only for keys carrying a real value, in a fixed,
    conventional order.
    """
    # Tuples (vs a dict) preserve the order of the emitted lines,
    # starting with the [repo name] header.
    tmpl = (
        ('reponame', '[%s]'),
        ('name', 'name=%s'),
        ('baseurl', 'baseurl=%s'),
        ('enabled', 'enabled=%s'),
        ('gpgcheck', 'gpgcheck=%s'),
        ('_type', 'type=%s'),
        ('gpgkey', 'gpgkey=%s'),
        ('proxy', 'proxy=%s'),
        ('priority', 'priority=%s'),
    )
    # Skip keys that are absent, None, or empty so no half-empty
    # "key=" lines end up in the repo file.
    lines = [fmt % kw.get(key)
             for key, fmt in tmpl
             if key in kw and kw.get(key) not in (None, '')]
    return '\n'.join(lines)
def repo_path(self):
    # type: () -> str
    """Return the path of the yum/dnf repo file that cephadm owns."""
    return '/etc/yum.repos.d/ceph.repo'
3585 def repo_baseurl(self
):
3586 assert self
.stable
or self
.version
3588 return '%s/rpm-%s/%s' % (args
.repo_url
, self
.version
,
3591 return '%s/rpm-%s/%s' % (args
.repo_url
, self
.stable
,
3595 if self
.stable
or self
.version
:
3598 'Ceph': '$basearch',
3599 'Ceph-noarch': 'noarch',
3600 'Ceph-source': 'SRPMS'}.items():
3601 content
+= '[%s]\n' % (n
)
3602 content
+= self
.custom_repo(
3604 baseurl
=self
.repo_baseurl() + '/' + t
,
3607 gpgkey
=self
.repo_gpgkey()[0],
3611 content
= self
.query_shaman(self
.distro_normalized
, self
.major
,
3615 logging
.info('Writing repo to %s...' % self
.repo_path())
3616 with
open(self
.repo_path(), 'w') as f
:
3619 if self
.distro_code
.startswith('el'):
3620 logger
.info('Enabling EPEL...')
3621 call_throws([self
.tool
, 'install', '-y', 'epel-release'])
3622 if self
.distro_code
== 'el8':
3623 # we also need Ken's copr repo, at least for now
3624 logger
.info('Enabling supplementary copr repo ktdreyer/ceph-el8...')
3625 call_throws(['dnf', 'copr', 'enable', '-y', 'ktdreyer/ceph-el8'])
3628 if os
.path
.exists(self
.repo_path()):
3629 os
.unlink(self
.repo_path())
3630 if self
.distro_code
== 'el8':
3631 logger
.info('Disabling supplementary copr repo ktdreyer/ceph-el8...')
3632 call_throws(['dnf', 'copr', 'disable', '-y', 'ktdreyer/ceph-el8'])
def install(self, ls):
    # type: (List[str]) -> None
    """Install the given packages via the distro tool (yum or dnf)."""
    logger.info('Installing packages %s...' % ls)
    cmd = [self.tool, 'install', '-y']
    call_throws(cmd + ls)
def install_podman(self):
    # type: () -> None
    """Install podman; it ships in the distro repos on yum/dnf platforms."""
    self.install(['podman'])
3642 class Zypper(Packager
):
3645 'opensuse-tumbleweed',
def __init__(self, stable, version, branch, commit,
             distro, distro_version):
    """Set up a zypper-based packager for openSUSE/SLE hosts."""
    super(Zypper, self).__init__(stable=stable, version=version,
                                 branch=branch, commit=commit)
    self.tool = 'zypper'
    # Everything is normalized to the 'opensuse' repo family.
    self.distro = 'opensuse'
    # Tumbleweed (rolling) and an unknown version both map to the
    # Leap 15.1 default; otherwise trust the reported version.
    if 'tumbleweed' not in distro and distro_version is not None:
        self.distro_version = distro_version
    else:
        self.distro_version = '15.1'
def custom_repo(self, **kw):
    """Build zypper repo-file text from keyword values.

    See YumDnf for the format explanation: a line is emitted only when
    its key has a real (non-None, non-empty) value, and line order is
    fixed by the template below.
    """
    # Tuples (vs a dict) preserve the order of the emitted lines,
    # starting with the [repo name] header.
    tmpl = (
        ('reponame', '[%s]'),
        ('name', 'name=%s'),
        ('baseurl', 'baseurl=%s'),
        ('enabled', 'enabled=%s'),
        ('gpgcheck', 'gpgcheck=%s'),
        ('_type', 'type=%s'),
        ('gpgkey', 'gpgkey=%s'),
        ('proxy', 'proxy=%s'),
        ('priority', 'priority=%s'),
    )
    lines = []
    for key, fmt in tmpl:
        value = kw.get(key)
        # absent, None, and '' all mean "leave the line out entirely"
        if value not in (None, ''):
            lines.append(fmt % value)
    return '\n'.join(lines)
def repo_path(self):
    # type: () -> str
    """Return the path of the zypper repo file that cephadm owns."""
    return '/etc/zypp/repos.d/ceph.repo'
def repo_baseurl(self):
    # type: () -> str
    """Return the base URL of the rpm repo for the requested build.

    Bug fix: both branches previously interpolated ``self.stable``, so a
    ``--version`` request still produced the stable-release URL. The
    version branch now uses ``self.version``, matching YumDnf.
    """
    assert self.stable or self.version
    if self.version:
        return '%s/rpm-%s/%s' % (args.repo_url, self.version, self.distro)
    else:
        return '%s/rpm-%s/%s' % (args.repo_url, self.stable, self.distro)
3699 if self
.stable
or self
.version
:
3702 'Ceph': '$basearch',
3703 'Ceph-noarch': 'noarch',
3704 'Ceph-source': 'SRPMS'}.items():
3705 content
+= '[%s]\n' % (n
)
3706 content
+= self
.custom_repo(
3708 baseurl
=self
.repo_baseurl() + '/' + t
,
3711 gpgkey
=self
.repo_gpgkey()[0],
3715 content
= self
.query_shaman(self
.distro
, self
.distro_version
,
3719 logging
.info('Writing repo to %s...' % self
.repo_path())
3720 with
open(self
.repo_path(), 'w') as f
:
3724 if os
.path
.exists(self
.repo_path()):
3725 os
.unlink(self
.repo_path())
def install(self, ls):
    # type: (List[str]) -> None
    """Install the given packages with ``zypper in -y``."""
    logger.info('Installing packages %s...' % ls)
    cmd = [self.tool, 'in', '-y'] + ls
    call_throws(cmd)
def install_podman(self):
    # type: () -> None
    """Install podman from the regular openSUSE repos."""
    self.install(['podman'])
def create_packager(stable=None, version=None, branch=None, commit=None):
    """Instantiate the Packager subclass matching the local distro.

    The requested build (stable release, exact version, or dev
    branch/commit) is passed straight through to the packager.

    Raises Error when the detected distro is not one we can manage.
    """
    distro, distro_version, distro_codename = get_distro()
    common = dict(stable=stable, version=version, branch=branch, commit=commit)
    if distro in YumDnf.DISTRO_NAMES:
        return YumDnf(distro=distro, distro_version=distro_version, **common)
    if distro in Apt.DISTRO_NAMES:
        # apt additionally needs the codename (e.g. 'bionic') for the
        # sources.list entry
        return Apt(distro=distro, distro_version=distro_version,
                   distro_codename=distro_codename, **common)
    if distro in Zypper.DISTRO_NAMES:
        return Zypper(distro=distro, distro_version=distro_version, **common)
    raise Error('Distro %s version %s not supported' % (distro, distro_version))
3753 def command_add_repo():
3754 if args
.version
and args
.release
:
3755 raise Error('you can specify either --release or --version but not both')
3758 (x
, y
, z
) = args
.version
.split('.')
3759 except Exception as e
:
3760 raise Error('version must be in the form x.y.z (e.g., 15.2.0)')
3762 pkg
= create_packager(stable
=args
.release
,
3763 version
=args
.version
,
3765 commit
=args
.dev_commit
)
3768 def command_rm_repo():
3769 pkg
= create_packager()
def command_install():
    # type: () -> None
    """Entry point for 'cephadm install': install args.packages."""
    create_packager().install(args.packages)
3776 ##################################
3779 # type: () -> argparse.ArgumentParser
3780 parser
= argparse
.ArgumentParser(
3781 description
='Bootstrap Ceph daemons with systemd and containers.',
3782 formatter_class
=argparse
.ArgumentDefaultsHelpFormatter
)
3783 parser
.add_argument(
3785 help='container image. Can also be set via the "CEPHADM_IMAGE" '
3787 parser
.add_argument(
3789 action
='store_true',
3790 help='use docker instead of podman')
3791 parser
.add_argument(
3794 help='base directory for daemon data')
3795 parser
.add_argument(
3798 help='base directory for daemon logs')
3799 parser
.add_argument(
3801 default
=LOGROTATE_DIR
,
3802 help='location of logrotate configuration files')
3803 parser
.add_argument(
3806 help='base directory for systemd units')
3807 parser
.add_argument(
3809 action
='store_true',
3810 help='Show debug-level log messages')
3811 parser
.add_argument(
3814 default
=DEFAULT_TIMEOUT
,
3815 help='timeout in seconds')
3816 parser
.add_argument(
3819 default
=DEFAULT_RETRY
,
3820 help='max number of retries')
3822 subparsers
= parser
.add_subparsers(help='sub-command')
3824 parser_version
= subparsers
.add_parser(
3825 'version', help='get ceph version from container')
3826 parser_version
.set_defaults(func
=command_version
)
3828 parser_pull
= subparsers
.add_parser(
3829 'pull', help='pull latest image version')
3830 parser_pull
.set_defaults(func
=command_pull
)
3832 parser_inspect_image
= subparsers
.add_parser(
3833 'inspect-image', help='inspect local container image')
3834 parser_inspect_image
.set_defaults(func
=command_inspect_image
)
3836 parser_ls
= subparsers
.add_parser(
3837 'ls', help='list daemon instances on this host')
3838 parser_ls
.set_defaults(func
=command_ls
)
3839 parser_ls
.add_argument(
3841 action
='store_true',
3842 help='Do not include daemon status')
3843 parser_ls
.add_argument(
3846 help='base directory for legacy daemon data')
3848 parser_list_networks
= subparsers
.add_parser(
3849 'list-networks', help='list IP networks')
3850 parser_list_networks
.set_defaults(func
=command_list_networks
)
3852 parser_adopt
= subparsers
.add_parser(
3853 'adopt', help='adopt daemon deployed with a different tool')
3854 parser_adopt
.set_defaults(func
=command_adopt
)
3855 parser_adopt
.add_argument(
3858 help='daemon name (type.id)')
3859 parser_adopt
.add_argument(
3862 help='deployment style (legacy, ...)')
3863 parser_adopt
.add_argument(
3866 help='cluster name')
3867 parser_adopt
.add_argument(
3870 help='base directory for legacy daemon data')
3871 parser_adopt
.add_argument(
3873 help='Additional configuration information in JSON format')
3874 parser_adopt
.add_argument(
3876 action
='store_true',
3877 help='Do not configure firewalld')
3878 parser_adopt
.add_argument(
3880 action
='store_true',
3881 help='do not pull the latest image before adopting')
3883 parser_rm_daemon
= subparsers
.add_parser(
3884 'rm-daemon', help='remove daemon instance')
3885 parser_rm_daemon
.set_defaults(func
=command_rm_daemon
)
3886 parser_rm_daemon
.add_argument(
3889 action
=CustomValidation
,
3890 help='daemon name (type.id)')
3891 parser_rm_daemon
.add_argument(
3894 help='cluster FSID')
3895 parser_rm_daemon
.add_argument(
3897 action
='store_true',
3898 help='proceed, even though this may destroy valuable data')
3899 parser_rm_daemon
.add_argument(
3900 '--force-delete-data',
3901 action
='store_true',
3902 help='delete valuable daemon data instead of making a backup')
3904 parser_rm_cluster
= subparsers
.add_parser(
3905 'rm-cluster', help='remove all daemons for a cluster')
3906 parser_rm_cluster
.set_defaults(func
=command_rm_cluster
)
3907 parser_rm_cluster
.add_argument(
3910 help='cluster FSID')
3911 parser_rm_cluster
.add_argument(
3913 action
='store_true',
3914 help='proceed, even though this may destroy valuable data')
3916 parser_run
= subparsers
.add_parser(
3917 'run', help='run a ceph daemon, in a container, in the foreground')
3918 parser_run
.set_defaults(func
=command_run
)
3919 parser_run
.add_argument(
3922 help='daemon name (type.id)')
3923 parser_run
.add_argument(
3926 help='cluster FSID')
3928 parser_shell
= subparsers
.add_parser(
3929 'shell', help='run an interactive shell inside a daemon container')
3930 parser_shell
.set_defaults(func
=command_shell
)
3931 parser_shell
.add_argument(
3933 help='cluster FSID')
3934 parser_shell
.add_argument(
3936 help='daemon name (type.id)')
3937 parser_shell
.add_argument(
3939 help='ceph.conf to pass through to the container')
3940 parser_shell
.add_argument(
3942 help='ceph.keyring to pass through to the container')
3943 parser_shell
.add_argument(
3947 help='set environment variable')
3948 parser_shell
.add_argument(
3949 'command', nargs
='*',
3950 help='command (optional)')
3952 parser_enter
= subparsers
.add_parser(
3953 'enter', help='run an interactive shell inside a running daemon container')
3954 parser_enter
.set_defaults(func
=command_enter
)
3955 parser_enter
.add_argument(
3957 help='cluster FSID')
3958 parser_enter
.add_argument(
3961 help='daemon name (type.id)')
3962 parser_enter
.add_argument(
3963 'command', nargs
='*',
3966 parser_ceph_volume
= subparsers
.add_parser(
3967 'ceph-volume', help='run ceph-volume inside a container')
3968 parser_ceph_volume
.set_defaults(func
=command_ceph_volume
)
3969 parser_ceph_volume
.add_argument(
3971 help='cluster FSID')
3972 parser_ceph_volume
.add_argument(
3974 help='JSON file with config and (client.bootrap-osd) key')
3975 parser_ceph_volume
.add_argument(
3976 'command', nargs
='+',
3979 parser_unit
= subparsers
.add_parser(
3980 'unit', help='operate on the daemon\'s systemd unit')
3981 parser_unit
.set_defaults(func
=command_unit
)
3982 parser_unit
.add_argument(
3984 help='systemd command (start, stop, restart, enable, disable, ...)')
3985 parser_unit
.add_argument(
3987 help='cluster FSID')
3988 parser_unit
.add_argument(
3991 help='daemon name (type.id)')
3993 parser_logs
= subparsers
.add_parser(
3994 'logs', help='print journald logs for a daemon container')
3995 parser_logs
.set_defaults(func
=command_logs
)
3996 parser_logs
.add_argument(
3998 help='cluster FSID')
3999 parser_logs
.add_argument(
4002 help='daemon name (type.id)')
4003 parser_logs
.add_argument(
4004 'command', nargs
='*',
4005 help='additional journalctl args')
4007 parser_bootstrap
= subparsers
.add_parser(
4008 'bootstrap', help='bootstrap a cluster (mon + mgr daemons)')
4009 parser_bootstrap
.set_defaults(func
=command_bootstrap
)
4010 parser_bootstrap
.add_argument(
4012 help='ceph conf file to incorporate')
4013 parser_bootstrap
.add_argument(
4016 help='mon id (default: local hostname)')
4017 parser_bootstrap
.add_argument(
4019 help='mon IPs (e.g., [v2:localipaddr:3300,v1:localipaddr:6789])')
4020 parser_bootstrap
.add_argument(
4023 parser_bootstrap
.add_argument(
4026 help='mgr id (default: randomly generated)')
4027 parser_bootstrap
.add_argument(
4029 help='cluster FSID')
4030 parser_bootstrap
.add_argument(
4032 default
='/etc/ceph',
4033 help='directory to write config, keyring, and pub key files')
4034 parser_bootstrap
.add_argument(
4036 help='location to write keyring file with new cluster admin and mon keys')
4037 parser_bootstrap
.add_argument(
4039 help='location to write conf file to connect to new cluster')
4040 parser_bootstrap
.add_argument(
4041 '--output-pub-ssh-key',
4042 help='location to write the cluster\'s public SSH key')
4043 parser_bootstrap
.add_argument(
4045 action
='store_true',
4046 help='skip setup of ssh key on local host')
4047 parser_bootstrap
.add_argument(
4048 '--initial-dashboard-user',
4050 help='Initial user for the dashboard')
4051 parser_bootstrap
.add_argument(
4052 '--initial-dashboard-password',
4053 help='Initial password for the initial dashboard user')
4055 parser_bootstrap
.add_argument(
4057 help='Dashboard key')
4058 parser_bootstrap
.add_argument(
4060 help='Dashboard certificate')
4062 parser_bootstrap
.add_argument(
4063 '--skip-mon-network',
4064 action
='store_true',
4065 help='set mon public_network based on bootstrap mon ip')
4066 parser_bootstrap
.add_argument(
4068 action
='store_true',
4069 help='do not enable the Ceph Dashboard')
4070 parser_bootstrap
.add_argument(
4071 '--dashboard-password-noupdate',
4072 action
='store_true',
4073 help='stop forced dashboard password change')
4074 parser_bootstrap
.add_argument(
4075 '--no-minimize-config',
4076 action
='store_true',
4077 help='do not assimilate and minimize the config file')
4078 parser_bootstrap
.add_argument(
4079 '--skip-ping-check',
4080 action
='store_true',
4081 help='do not verify that mon IP is pingable')
4082 parser_bootstrap
.add_argument(
4084 action
='store_true',
4085 help='do not pull the latest image before bootstrapping')
4086 parser_bootstrap
.add_argument(
4088 action
='store_true',
4089 help='Do not configure firewalld')
4090 parser_bootstrap
.add_argument(
4091 '--allow-overwrite',
4092 action
='store_true',
4093 help='allow overwrite of existing --output-* config/keyring/ssh files')
4094 parser_bootstrap
.add_argument(
4095 '--allow-fqdn-hostname',
4096 action
='store_true',
4097 help='allow hostname that is fully-qualified (contains ".")')
4098 parser_bootstrap
.add_argument(
4099 '--skip-prepare-host',
4100 action
='store_true',
4101 help='Do not prepare host')
4102 parser_bootstrap
.add_argument(
4103 '--orphan-initial-daemons',
4104 action
='store_true',
4105 help='Do not create initial mon, mgr, and crash service specs')
4106 parser_bootstrap
.add_argument(
4107 '--skip-monitoring-stack',
4108 action
='store_true',
4109 help='Do not automatically provision monitoring stack (prometheus, grafana, alertmanager, node-exporter)')
4111 parser_deploy
= subparsers
.add_parser(
4112 'deploy', help='deploy a daemon')
4113 parser_deploy
.set_defaults(func
=command_deploy
)
4114 parser_deploy
.add_argument(
4117 action
=CustomValidation
,
4118 help='daemon name (type.id)')
4119 parser_deploy
.add_argument(
4122 help='cluster FSID')
4123 parser_deploy
.add_argument(
4125 help='config file for new daemon')
4126 parser_deploy
.add_argument(
4128 help='Additional configuration information in JSON format')
4129 parser_deploy
.add_argument(
4131 help='keyring for new daemon')
4132 parser_deploy
.add_argument(
4134 help='key for new daemon')
4135 parser_deploy
.add_argument(
4137 help='OSD uuid, if creating an OSD container')
4138 parser_deploy
.add_argument(
4140 action
='store_true',
4141 help='Do not configure firewalld')
4142 parser_deploy
.add_argument(
4144 action
='store_true',
4145 help='Reconfigure a previously deployed daemon')
4146 parser_deploy
.add_argument(
4148 action
='store_true',
4149 help='Allow SYS_PTRACE on daemon container')
4151 parser_check_host
= subparsers
.add_parser(
4152 'check-host', help='check host configuration')
4153 parser_check_host
.set_defaults(func
=command_check_host
)
4154 parser_check_host
.add_argument(
4155 '--expect-hostname',
4156 help='Check that hostname matches an expected value')
4158 parser_prepare_host
= subparsers
.add_parser(
4159 'prepare-host', help='prepare a host for cephadm use')
4160 parser_prepare_host
.set_defaults(func
=command_prepare_host
)
4161 parser_prepare_host
.add_argument(
4162 '--expect-hostname',
4163 help='Set hostname')
4165 parser_add_repo
= subparsers
.add_parser(
4166 'add-repo', help='configure package repository')
4167 parser_add_repo
.set_defaults(func
=command_add_repo
)
4168 parser_add_repo
.add_argument(
4170 help='use latest version of a named release (e.g., octopus)')
4171 parser_add_repo
.add_argument(
4173 help='use specific upstream version (x.y.z)')
4174 parser_add_repo
.add_argument(
4176 help='use specified bleeding edge build from git branch or tag')
4177 parser_add_repo
.add_argument(
4179 help='use specified bleeding edge build from git commit')
4180 parser_add_repo
.add_argument(
4182 help='specify alternative GPG key location')
4183 parser_add_repo
.add_argument(
4185 default
='https://download.ceph.com',
4186 help='specify alternative repo location')
4189 parser_rm_repo
= subparsers
.add_parser(
4190 'rm-repo', help='remove package repository configuration')
4191 parser_rm_repo
.set_defaults(func
=command_rm_repo
)
4193 parser_install
= subparsers
.add_parser(
4194 'install', help='install ceph package(s)')
4195 parser_install
.set_defaults(func
=command_install
)
4196 parser_install
.add_argument(
4197 'packages', nargs
='*',
4198 default
=['cephadm'],
def _parse_args(av):
    """Parse the given argv list with the cephadm argument parser."""
    return _get_parser().parse_args(av)
4207 if __name__
== "__main__":
4208 # allow argv to be injected
4210 av
= injected_argv
# type: ignore
4213 args
= _parse_args(av
)
4216 logging
.basicConfig(level
=logging
.DEBUG
)
4218 logging
.basicConfig(level
=logging
.INFO
)
4219 logger
= logging
.getLogger('cephadm')
4222 if os
.geteuid() != 0:
4223 sys
.stderr
.write('ERROR: cephadm should be run as root\n')
4228 container_path
= find_program('docker')
4230 for i
in CONTAINER_PREFERENCE
:
4232 container_path
= find_program(i
)
4234 except Exception as e
:
4235 logger
.debug('Could not locate %s: %s' % (i
, e
))
4236 if not container_path
and args
.func
!= command_prepare_host
:
4237 sys
.stderr
.write('Unable to locate any of %s\n' % CONTAINER_PREFERENCE
)
4240 if 'func' not in args
:
4241 sys
.stderr
.write('No command specified; pass -h or --help for usage\n')
4249 sys
.stderr
.write('ERROR: %s\n' % e
)