#!/usr/bin/python3

DEFAULT_IMAGE='docker.io/ceph/ceph:v15'
DATA_DIR='/var/lib/ceph'
LOG_DIR='/var/log/ceph'
LOCK_DIR='/run/cephadm'
LOGROTATE_DIR='/etc/logrotate.d'
UNIT_DIR='/etc/systemd/system'
LOG_DIR_MODE=0o770
DATA_DIR_MODE=0o700
CONTAINER_PREFERENCE = ['podman', 'docker']  # prefer podman to docker
CUSTOM_PS1=r'[ceph: \u@\h \W]\$ '
DEFAULT_TIMEOUT=None  # in seconds
DEFAULT_RETRY=10
SHELL_DEFAULT_CONF='/etc/ceph/ceph.conf'
SHELL_DEFAULT_KEYRING='/etc/ceph/ceph.client.admin.keyring'

"""
You can invoke cephadm in two ways:

1. The normal way, at the command line.

2. By piping the script to the python3 binary. In this latter case, you should
   prepend one or more lines to the beginning of the script.

   For arguments,

       injected_argv = [...]

   e.g.,

       injected_argv = ['ls']

   For reading stdin from the '--config-json -' argument,

       injected_stdin = '...'
"""

import argparse
import datetime
import fcntl
import json
import logging
import os
import platform
import random
import re
import select
import shutil
import socket
import string
import subprocess
import sys
import tempfile
import time
import errno
try:
    from typing import Dict, List, Tuple, Optional, Union, Any, NoReturn, Callable
except ImportError:
    pass
import uuid

from functools import wraps
from glob import glob
from threading import Thread

if sys.version_info >= (3, 0):
    from io import StringIO
else:
    from StringIO import StringIO

if sys.version_info >= (3, 2):
    from configparser import ConfigParser
else:
    from ConfigParser import SafeConfigParser

if sys.version_info >= (3, 0):
    from urllib.request import urlopen
    from urllib.error import HTTPError
else:
    from urllib2 import urlopen, HTTPError

container_path = ''
cached_stdin = None

DATEFMT = '%Y-%m-%dT%H:%M:%S.%f'

class Error(Exception):
    pass

class TimeoutExpired(Error):
    pass

##################################

class Ceph(object):
    daemons = ('mon', 'mgr', 'mds', 'osd', 'rgw', 'rbd-mirror',
               'crash')

##################################

class Monitoring(object):
    """Define the configs for the monitoring containers"""

    port_map = {
        "prometheus": [9095],  # Avoid default 9090, due to conflict with cockpit UI
        "node-exporter": [9100],
        "grafana": [3000],
        "alertmanager": [9093, 9094],
    }

    components = {
        "prometheus": {
            "image": "prom/prometheus:latest",
            "cpus": '2',
            "memory": '4GB',
            "args": [
                "--config.file=/etc/prometheus/prometheus.yml",
                "--storage.tsdb.path=/prometheus",
                "--web.listen-address=:{}".format(port_map['prometheus'][0]),
            ],
            "config-json-files": [
                "prometheus.yml",
            ],
        },
        "node-exporter": {
            "image": "prom/node-exporter",
            "cpus": "1",
            "memory": "1GB",
            "args": [
                "--no-collector.timex",
            ],
        },
        "grafana": {
            "image": "ceph/ceph-grafana:latest",
            "cpus": "2",
            "memory": "4GB",
            "args": [],
            "config-json-files": [
                "grafana.ini",
                "provisioning/datasources/ceph-dashboard.yml",
                "certs/cert_file",
                "certs/cert_key",
            ],
        },
        "alertmanager": {
            "image": "prom/alertmanager",
            "cpus": "2",
            "memory": "2GB",
            "args": [],
            "config-json-files": [
                "alertmanager.yml",
            ],
            "config-json-args": [
                "peers",
            ],
        },
    }  # type: ignore
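
    # Illustrative (an editor's note, not from the original source): the
    # --config-json payload for these daemons is expected to carry a 'files'
    # map keyed by the names in 'config-json-files' above, matching the
    # 'files' handling in create_daemon_dirs() below, e.g.:
    #
    #   {"files": {"prometheus.yml": "<content string or list of lines>"}}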

##################################

class NFSGanesha(object):
    """Defines a NFS-Ganesha container"""

    daemon_type = 'nfs'
    entrypoint = '/usr/bin/ganesha.nfsd'
    daemon_args = ['-F', '-L', 'STDERR']

    required_files = ['ganesha.conf']

    port_map = {
        "nfs": 2049,
    }

    def __init__(self,
                 fsid,
                 daemon_id,
                 config_json,
                 image=DEFAULT_IMAGE):
        # type: (str, Union[int, str], Dict, str) -> None
        self.fsid = fsid
        self.daemon_id = daemon_id
        self.image = image

        def json_get(key, default=None, require=False):
            if require and key not in config_json:
                raise Error('{} missing from config-json'.format(key))
            return config_json.get(key, default)

        # config-json options
        self.pool = json_get('pool', require=True)
        self.namespace = json_get('namespace')
        self.files = json_get('files', {})

        # validate the supplied args
        self.validate()

    @classmethod
    def init(cls, fsid, daemon_id):
        # type: (str, Union[int, str]) -> NFSGanesha
        return cls(fsid, daemon_id, get_parm(args.config_json), args.image)

    @staticmethod
    def port_in_use():
        # type: () -> None
        for (srv, port) in NFSGanesha.port_map.items():
            if port_in_use(port):
                msg = 'TCP port {} required for {} is already in use'.format(port, srv)
                raise Error(msg)

    @staticmethod
    def get_container_mounts(data_dir):
        # type: (str) -> Dict[str, str]
        mounts = dict()
        mounts[os.path.join(data_dir, 'config')] = '/etc/ceph/ceph.conf:z'
        mounts[os.path.join(data_dir, 'keyring')] = '/etc/ceph/keyring:z'
        mounts[os.path.join(data_dir, 'etc/ganesha')] = '/etc/ganesha:z'
        return mounts

    @staticmethod
    def get_container_envs():
        # type: () -> List[str]
        envs = [
            'CEPH_CONF=%s' % ('/etc/ceph/ceph.conf')
        ]
        return envs

    @staticmethod
    def get_version(container_id):
        # type: (str) -> Optional[str]
        version = None
        out, err, code = call(
            [container_path, 'exec', container_id,
             NFSGanesha.entrypoint, '-v'])
        if code == 0:
            match = re.search(r'NFS-Ganesha Release\s*=\s*[V]*([\d.]+)', out)
            if match:
                version = match.group(1)
        return version

    def validate(self):
        # type: () -> None
        if not is_fsid(self.fsid):
            raise Error('not an fsid: %s' % self.fsid)
        if not self.daemon_id:
            raise Error('invalid daemon_id: %s' % self.daemon_id)
        if not self.image:
            raise Error('invalid image: %s' % self.image)

        # check for the required files
        if self.required_files:
            for fname in self.required_files:
                if fname not in self.files:
                    raise Error('required file missing from config-json: %s' % fname)

    def get_daemon_name(self):
        # type: () -> str
        return '%s.%s' % (self.daemon_type, self.daemon_id)

    def get_container_name(self, desc=None):
        # type: (Optional[str]) -> str
        cname = 'ceph-%s-%s' % (self.fsid, self.get_daemon_name())
        if desc:
            cname = '%s-%s' % (cname, desc)
        return cname

    def get_file_content(self, fname):
        # type: (str) -> str
        """Normalize the json file content into a string"""
        content = self.files.get(fname)
        if isinstance(content, list):
            content = '\n'.join(content)
        return content

    def create_daemon_dirs(self, data_dir, uid, gid):
        # type: (str, int, int) -> None
        """Create files under the container data dir"""
        if not os.path.isdir(data_dir):
            raise OSError('data_dir is not a directory: %s' % (data_dir))

        logger.info('Creating ganesha config...')

        # create the ganesha conf dir
        config_dir = os.path.join(data_dir, 'etc/ganesha')
        makedirs(config_dir, uid, gid, 0o755)

        # populate files from the config-json
        for fname in self.files:
            config_file = os.path.join(config_dir, fname)
            config_content = self.get_file_content(fname)
            logger.info('Write file: %s' % (config_file))
            with open(config_file, 'w') as f:
                os.fchown(f.fileno(), uid, gid)
                os.fchmod(f.fileno(), 0o600)
                f.write(config_content)

    def get_rados_grace_container(self, action):
        # type: (str) -> CephContainer
        """Container for a ganesha action on the grace db"""
        entrypoint = '/usr/bin/ganesha-rados-grace'

        assert self.pool
        args = ['--pool', self.pool]
        if self.namespace:
            args += ['--ns', self.namespace]
        args += [action, self.get_daemon_name()]

        data_dir = get_data_dir(self.fsid, self.daemon_type, self.daemon_id)
        volume_mounts = self.get_container_mounts(data_dir)
        envs = self.get_container_envs()

        logger.info('Creating RADOS grace for action: %s' % (action))
        c = CephContainer(
            image=self.image,
            entrypoint=entrypoint,
            args=args,
            volume_mounts=volume_mounts,
            cname=self.get_container_name(desc='grace-%s' % (action)),
            envs=envs
        )
        return c

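# Illustrative config-json for an NFS-Ganesha daemon (an editor's sketch;
# keys follow the json_get() calls and required_files above, values are
# placeholders):
#
#   {"pool": "nfs-ganesha", "namespace": "mynfs", "files": {"ganesha.conf": "..."}}
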
##################################

def get_supported_daemons():
    supported_daemons = list(Ceph.daemons)
    supported_daemons.extend(Monitoring.components)
    supported_daemons.append(NFSGanesha.daemon_type)
    assert len(supported_daemons) == len(set(supported_daemons))
    return supported_daemons

##################################

def attempt_bind(s, address, port):
    # type: (socket.socket, str, int) -> None
    try:
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind((address, port))
    except (socket.error, OSError) as e:  # py2 and py3
        msg = 'Cannot bind to IP %s port %d: %s' % (address, port, e)
        logger.warning(msg)
        if e.errno == errno.EADDRINUSE:
            raise OSError(msg)
        elif e.errno == errno.EADDRNOTAVAIL:
            pass
    finally:
        s.close()

def port_in_use(port_num):
    # type: (int) -> bool
    """Detect whether a port is in use on the local machine - IPv4 and IPv6"""
    logger.info('Verifying port %d ...' % (port_num))
    try:
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        attempt_bind(s, '0.0.0.0', port_num)

        s = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
        attempt_bind(s, '::', port_num)
    except OSError:
        return True
    else:
        return False

def check_ip_port(ip, port):
    if not args.skip_ping_check:
        logger.info('Verifying IP %s port %d ...' % (ip, port))
        if ip.startswith('[') or '::' in ip:
            s = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
        else:
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            attempt_bind(s, ip, port)
        except OSError as e:
            raise Error(e)

##################################

# this is an abbreviated version of
# https://github.com/benediktschmitt/py-filelock/blob/master/filelock.py
# that drops all of the compatibility (this is Unix/Linux only).

try:
    TimeoutError
except NameError:
    TimeoutError = OSError

class Timeout(TimeoutError):
    """
    Raised when the lock could not be acquired in *timeout*
    seconds.
    """

    def __init__(self, lock_file):
        """
        """
        #: The path of the file lock.
        self.lock_file = lock_file
        return None

    def __str__(self):
        temp = "The file lock '{}' could not be acquired."\
               .format(self.lock_file)
        return temp


class _Acquire_ReturnProxy(object):
    def __init__(self, lock):
        self.lock = lock
        return None

    def __enter__(self):
        return self.lock

    def __exit__(self, exc_type, exc_value, traceback):
        self.lock.release()
        return None


class FileLock(object):
    def __init__(self, name, timeout=-1):
        if not os.path.exists(LOCK_DIR):
            os.mkdir(LOCK_DIR, 0o700)
        self._lock_file = os.path.join(LOCK_DIR, name + '.lock')

        # The file descriptor for the *_lock_file* as it is returned by the
        # os.open() function.
        # This file lock is only NOT None, if the object currently holds the
        # lock.
        self._lock_file_fd = None
        self.timeout = timeout
        # The lock counter is used for implementing the nested locking
        # mechanism. Whenever the lock is acquired, the counter is increased and
        # the lock is only released, when this value is 0 again.
        self._lock_counter = 0
        return None

    @property
    def is_locked(self):
        return self._lock_file_fd is not None

    def acquire(self, timeout=None, poll_intervall=0.05):
        """
        Acquires the file lock or fails with a :exc:`Timeout` error.
        .. code-block:: python
            # You can use this method in the context manager (recommended)
            with lock.acquire():
                pass
            # Or use an equivalent try-finally construct:
            lock.acquire()
            try:
                pass
            finally:
                lock.release()
        :arg float timeout:
            The maximum time waited for the file lock.
            If ``timeout < 0``, there is no timeout and this method will
            block until the lock could be acquired.
            If ``timeout`` is None, the default :attr:`~timeout` is used.
        :arg float poll_intervall:
            We check once in *poll_intervall* seconds if we can acquire the
            file lock.
        :raises Timeout:
            if the lock could not be acquired in *timeout* seconds.
        .. versionchanged:: 2.0.0
            This method returns now a *proxy* object instead of *self*,
            so that it can be used in a with statement without side effects.
        """
        # Use the default timeout, if no timeout is provided.
        if timeout is None:
            timeout = self.timeout

        # Increment the number right at the beginning.
        # We can still undo it, if something fails.
        self._lock_counter += 1

        lock_id = id(self)
        lock_filename = self._lock_file
        start_time = time.time()
        try:
            while True:
                if not self.is_locked:
                    logger.debug('Acquiring lock %s on %s', lock_id,
                                 lock_filename)
                    self._acquire()

                if self.is_locked:
                    logger.debug('Lock %s acquired on %s', lock_id,
                                 lock_filename)
                    break
                elif timeout >= 0 and time.time() - start_time > timeout:
                    logger.warning('Timeout acquiring lock %s on %s', lock_id,
                                   lock_filename)
                    raise Timeout(self._lock_file)
                else:
                    logger.debug(
                        'Lock %s not acquired on %s, waiting %s seconds ...',
                        lock_id, lock_filename, poll_intervall
                    )
                    time.sleep(poll_intervall)
        except:
            # Something did go wrong, so decrement the counter.
            self._lock_counter = max(0, self._lock_counter - 1)

            raise
        return _Acquire_ReturnProxy(lock=self)

    def release(self, force=False):
        """
        Releases the file lock.
        Please note, that the lock is only completely released, if the lock
        counter is 0.
        Also note, that the lock file itself is not automatically deleted.
        :arg bool force:
            If true, the lock counter is ignored and the lock is released in
            every case.
        """
        if self.is_locked:
            self._lock_counter -= 1

            if self._lock_counter == 0 or force:
                lock_id = id(self)
                lock_filename = self._lock_file

                logger.debug('Releasing lock %s on %s', lock_id, lock_filename)
                self._release()
                self._lock_counter = 0
                logger.debug('Lock %s released on %s', lock_id, lock_filename)

        return None

    def __enter__(self):
        self.acquire()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.release()
        return None

    def __del__(self):
        self.release(force=True)
        return None


    def _acquire(self):
        open_mode = os.O_RDWR | os.O_CREAT | os.O_TRUNC
        fd = os.open(self._lock_file, open_mode)

        try:
            fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except (IOError, OSError):
            os.close(fd)
        else:
            self._lock_file_fd = fd
        return None

    def _release(self):
        # Do not remove the lockfile:
        #
        # https://github.com/benediktschmitt/py-filelock/issues/31
        # https://stackoverflow.com/questions/17708885/flock-removing-locked-file-without-race-condition
        fd = self._lock_file_fd
        self._lock_file_fd = None
        fcntl.flock(fd, fcntl.LOCK_UN)
        os.close(fd)
        return None

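# Usage sketch (mirrors the acquire() docstring above):
#
#   lock = FileLock('mycluster', timeout=5)   # lock file: /run/cephadm/mycluster.lock
#   with lock.acquire():
#       ...  # exclusive section; lock released on exit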

##################################
# Popen wrappers, lifted from ceph-volume

def call(command,  # type: List[str]
         desc=None,  # type: Optional[str]
         verbose=False,  # type: bool
         verbose_on_failure=True,  # type: bool
         timeout=DEFAULT_TIMEOUT,  # type: Optional[int]
         **kwargs):
    """
    Wrap subprocess.Popen to

    - log stdout/stderr to a logger,
    - decode utf-8
    - cleanly return out, err, returncode

    If verbose=True, log at info (instead of debug) level.

    :param verbose_on_failure: On a non-zero exit status, it will forcefully set
                               logging ON for the terminal
    :param timeout: timeout in seconds
    """
    if not desc:
        desc = command[0]
    timeout = timeout or args.timeout

    logger.debug("Running command: %s" % ' '.join(command))
    process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        close_fds=True,
        **kwargs
    )
    # get current p.stdout flags, add O_NONBLOCK
    assert process.stdout is not None
    assert process.stderr is not None
    stdout_flags = fcntl.fcntl(process.stdout, fcntl.F_GETFL)
    stderr_flags = fcntl.fcntl(process.stderr, fcntl.F_GETFL)
    fcntl.fcntl(process.stdout, fcntl.F_SETFL, stdout_flags | os.O_NONBLOCK)
    fcntl.fcntl(process.stderr, fcntl.F_SETFL, stderr_flags | os.O_NONBLOCK)

    out = ''
    err = ''
    reads = None
    stop = False
    out_buffer = ''  # partial line (no newline yet)
    err_buffer = ''  # partial line (no newline yet)
    start_time = time.time()
    end_time = None
    if timeout:
        end_time = start_time + timeout
    while not stop:
        if end_time and (time.time() >= end_time):
            logger.info(desc + ':timeout after %s seconds' % timeout)
            stop = True
            process.kill()
        if reads and process.poll() is not None:
            # we want to stop, but first read off anything remaining
            # on stdout/stderr
            stop = True
        else:
            reads, _, _ = select.select(
                [process.stdout.fileno(), process.stderr.fileno()],
                [], [], timeout
            )
        for fd in reads:
            try:
                message_b = os.read(fd, 1024)
                if isinstance(message_b, bytes):
                    message = message_b.decode('utf-8')
                if isinstance(message_b, str):
                    message = message_b
                if fd == process.stdout.fileno():
                    out += message
                    message = out_buffer + message
                    lines = message.split('\n')
                    out_buffer = lines.pop()
                    for line in lines:
                        if verbose:
                            logger.info(desc + ':stdout ' + line)
                        else:
                            logger.debug(desc + ':stdout ' + line)
                elif fd == process.stderr.fileno():
                    err += message
                    message = err_buffer + message
                    lines = message.split('\n')
                    err_buffer = lines.pop()
                    for line in lines:
                        if verbose:
                            logger.info(desc + ':stderr ' + line)
                        else:
                            logger.debug(desc + ':stderr ' + line)
                else:
                    assert False
            except (IOError, OSError):
                pass

    returncode = process.wait()

    if out_buffer != '':
        if verbose:
            logger.info(desc + ':stdout ' + out_buffer)
        else:
            logger.debug(desc + ':stdout ' + out_buffer)
    if err_buffer != '':
        if verbose:
            logger.info(desc + ':stderr ' + err_buffer)
        else:
            logger.debug(desc + ':stderr ' + err_buffer)

    if returncode != 0 and verbose_on_failure and not verbose:
        # dump stdout + stderr
        logger.info('Non-zero exit code %d from %s' % (returncode, ' '.join(command)))
        for line in out.splitlines():
            logger.info(desc + ':stdout ' + line)
        for line in err.splitlines():
            logger.info(desc + ':stderr ' + line)

    return out, err, returncode

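# Example use (illustrative; mirrors calls made elsewhere in this file):
#
#   out, err, code = call(['systemctl', 'is-active', 'ceph.target'],
#                         verbose_on_failure=False)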

def call_throws(command, **kwargs):
    # type: (List[str], Any) -> Tuple[str, str, int]
    out, err, ret = call(command, **kwargs)
    if ret:
        raise RuntimeError('Failed command: %s' % ' '.join(command))
    return out, err, ret


def call_timeout(command, timeout):
    # type: (List[str], int) -> int

    logger.debug('Running command (timeout=%s): %s'
                 % (timeout, ' '.join(command)))

    def raise_timeout(command, timeout):
        # type: (List[str], int) -> NoReturn
        msg = 'Command \'%s\' timed out after %s seconds' % (command, timeout)
        logger.debug(msg)
        raise TimeoutExpired(msg)

    def call_timeout_py2(command, timeout):
        # type: (List[str], int) -> int
        proc = subprocess.Popen(command)
        thread = Thread(target=proc.wait)
        thread.start()
        thread.join(timeout)
        if thread.is_alive():
            proc.kill()
            thread.join()
            raise_timeout(command, timeout)
        return proc.returncode

    def call_timeout_py3(command, timeout):
        # type: (List[str], int) -> int
        try:
            return subprocess.call(command, timeout=timeout)
        except subprocess.TimeoutExpired as e:
            raise_timeout(command, timeout)

    ret = 1
    if sys.version_info >= (3, 3):
        ret = call_timeout_py3(command, timeout)
    else:
        # py2 subprocess has no timeout arg
        ret = call_timeout_py2(command, timeout)
    return ret

##################################

def is_available(what, func):
    # type: (str, Callable[[], bool]) -> None
    """
    Wait for a service to become available

    :param what: the name of the service
    :param func: the callable object that determines availability
    """
    retry = args.retry
    logger.info('Waiting for %s...' % (what))
    num = 1
    while True:
        if func():
            break
        elif num > retry:
            raise Error('%s not available after %s tries'
                        % (what, retry))

        logger.info('%s not available, waiting (%s/%s)...'
                    % (what, num, retry))

        num += 1
        time.sleep(1)


def read_config(fn):
    # type: (Optional[str]) -> ConfigParser
    # bend over backwards here because py2's ConfigParser doesn't like
    # whitespace before config option names (e.g., '\n foo = bar\n').
    # Yeesh!
    if sys.version_info >= (3, 2):
        cp = ConfigParser()
    else:
        cp = SafeConfigParser()

    if fn:
        with open(fn, 'r') as f:
            raw_conf = f.read()
        nice_conf = re.sub(r'\n(\s)+', r'\n', raw_conf)
        s_io = StringIO(nice_conf)
        if sys.version_info >= (3, 2):
            cp.read_file(s_io)
        else:
            cp.readfp(s_io)

    return cp

def pathify(p):
    # type: (str) -> str
    if not p.startswith('/'):
        return os.path.join(os.getcwd(), p)
    return p

def get_file_timestamp(fn):
    try:
        mt = os.path.getmtime(fn)
        return datetime.datetime.fromtimestamp(
            mt, tz=datetime.timezone.utc
        ).strftime(DATEFMT)
    except Exception as e:
        return None

def try_convert_datetime(s):
    # This is super irritating because
    #  1) podman and docker use different formats
    #  2) python's strptime can't parse either one
    #
    # I've seen:
    #  docker 18.09.7:  2020-03-03T09:21:43.636153304Z
    #  podman 1.7.0:    2020-03-03T15:52:30.136257504-06:00
    #                   2020-03-03 15:52:30.136257504 -0600 CST
    # (In the podman case, there is a different string format for
    # 'inspect' and 'inspect --format {{.Created}}'!!)

    # In *all* cases, the 9 digit second precision is too much for
    # python's strptime.  Shorten it to 6 digits.
    p = re.compile(r'(\.[\d]{6})[\d]*')
    s = p.sub(r'\1', s)

    # replace trailing Z with -0000, since (on python 3.6.8) it won't parse
    if s and s[-1] == 'Z':
        s = s[:-1] + '-0000'

    # cut off the redundant 'CST' part that strptime can't parse, if
    # present.
    v = s.split(' ')
    s = ' '.join(v[0:3])

    # try parsing with several format strings
    fmts = [
        '%Y-%m-%dT%H:%M:%S.%f%z',
        '%Y-%m-%d %H:%M:%S.%f %z',
    ]
    for f in fmts:
        try:
            # return timestamp normalized to UTC, rendered as DATEFMT.
            return datetime.datetime.strptime(s, f).astimezone(tz=datetime.timezone.utc).strftime(DATEFMT)
        except ValueError:
            pass
    return None

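# e.g. (illustrative, using the docker timestamp from the comment above):
#
#   try_convert_datetime('2020-03-03T09:21:43.636153304Z')
#   -> '2020-03-03T09:21:43.636153'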
def get_podman_version():
    # type: () -> Tuple[int, ...]
    if 'podman' not in container_path:
        raise ValueError('not using podman')
    out, _, _ = call_throws([container_path, '--version'])
    return _parse_podman_version(out)

def _parse_podman_version(out):
    # type: (str) -> Tuple[int, ...]
    _, _, version_str = out.strip().split()

    def to_int(val, org_e=None):
        if not val and org_e:
            raise org_e
        try:
            return int(val)
        except ValueError as e:
            return to_int(val[0:-1], org_e or e)

    return tuple(map(to_int, version_str.split('.')))

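# e.g. _parse_podman_version('podman version 1.6.2') == (1, 6, 2); the
# to_int() fallback strips trailing non-digit characters one at a time,
# so a part like '2-dev' also parses as 2 (illustrative).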

def get_hostname():
    # type: () -> str
    return socket.gethostname()

def get_fqdn():
    # type: () -> str
    return socket.getfqdn() or socket.gethostname()

def get_arch():
    # type: () -> str
    return platform.uname().machine

def generate_service_id():
    # type: () -> str
    return get_hostname() + '.' + ''.join(random.choice(string.ascii_lowercase)
                                          for _ in range(6))

def generate_password():
    # type: () -> str
    return ''.join(random.choice(string.ascii_lowercase + string.digits)
                   for i in range(10))

def normalize_container_id(i):
    # type: (str) -> str
    # docker adds the sha256: prefix, but AFAICS both
    # docker (18.09.7 in bionic at least) and podman
    # both always use sha256, so leave off the prefix
    # for consistency.
    prefix = 'sha256:'
    if i.startswith(prefix):
        i = i[len(prefix):]
    return i

def make_fsid():
    # type: () -> str
    return str(uuid.uuid1())

def is_fsid(s):
    # type: (str) -> bool
    try:
        uuid.UUID(s)
    except ValueError:
        return False
    return True

def infer_fsid(func):
    """
    If we only find a single fsid in /var/lib/ceph/*, use that
    """
    @wraps(func)
    def _infer_fsid():
        if args.fsid:
            logger.debug('Using specified fsid: %s' % args.fsid)
            return func()

        fsids = set()
        daemon_list = list_daemons(detail=False)
        for daemon in daemon_list:
            if 'name' not in args or not args.name:
                fsids.add(daemon['fsid'])
            elif daemon['name'] == args.name:
                fsids.add(daemon['fsid'])
        fsids = list(fsids)

        if not fsids:
            # some commands do not always require an fsid
            pass
        elif len(fsids) == 1:
            logger.info('Inferring fsid %s' % fsids[0])
            args.fsid = fsids[0]
        else:
            raise Error('Cannot infer an fsid, one must be specified: %s' % fsids)
        return func()

    return _infer_fsid

def infer_image(func):
    """
    Use the most recent ceph image
    """
    @wraps(func)
    def _infer_image():
        if not args.image:
            args.image = os.environ.get('CEPHADM_IMAGE')
        if not args.image:
            args.image = get_last_local_ceph_image()
        if not args.image:
            args.image = DEFAULT_IMAGE
        return func()

    return _infer_image

def default_image(func):
    @wraps(func)
    def _default_image():
        if not args.image:
            if 'name' in args and args.name:
                type_ = args.name.split('.', 1)[0]
                if type_ in Monitoring.components:
                    args.image = Monitoring.components[type_]['image']
            if not args.image:
                args.image = os.environ.get('CEPHADM_IMAGE')
            if not args.image:
                args.image = DEFAULT_IMAGE
        return func()

    return _default_image

def get_last_local_ceph_image():
    """
    :return: The most recent local ceph image (already pulled)
    """
    out, _, _ = call_throws(
        [container_path, 'images',
         '--filter', 'label=ceph=True',
         '--format', '{{.Repository}} {{.Tag}}'])
    out_lines = out.splitlines()
    if len(out_lines) > 0:
        repository, tag = out_lines[0].split()
        r = '{}:{}'.format(repository, tag)
        logger.info('Using recent ceph image %s' % r)
        return r
    return None

def write_tmp(s, uid, gid):
    tmp_f = tempfile.NamedTemporaryFile(mode='w',
                                        prefix='ceph-tmp')
    os.fchown(tmp_f.fileno(), uid, gid)
    tmp_f.write(s)
    tmp_f.flush()

    return tmp_f

def makedirs(dir, uid, gid, mode):
    # type: (str, int, int, int) -> None
    if not os.path.exists(dir):
        os.makedirs(dir, mode=mode)
    else:
        os.chmod(dir, mode)
    os.chown(dir, uid, gid)
    os.chmod(dir, mode)   # the above is masked by umask...

def get_data_dir(fsid, t, n):
    # type: (str, str, Union[int, str]) -> str
    return os.path.join(args.data_dir, fsid, '%s.%s' % (t, n))

def get_log_dir(fsid):
    # type: (str) -> str
    return os.path.join(args.log_dir, fsid)

def make_data_dir_base(fsid, uid, gid):
    # type: (str, int, int) -> str
    data_dir_base = os.path.join(args.data_dir, fsid)
    makedirs(data_dir_base, uid, gid, DATA_DIR_MODE)
    makedirs(os.path.join(data_dir_base, 'crash'), uid, gid, DATA_DIR_MODE)
    makedirs(os.path.join(data_dir_base, 'crash', 'posted'), uid, gid,
             DATA_DIR_MODE)
    return data_dir_base

def make_data_dir(fsid, daemon_type, daemon_id, uid=None, gid=None):
    # type: (str, str, Union[int, str], int, int) -> str
    if not uid or not gid:
        (uid, gid) = extract_uid_gid()
    make_data_dir_base(fsid, uid, gid)
    data_dir = get_data_dir(fsid, daemon_type, daemon_id)
    makedirs(data_dir, uid, gid, DATA_DIR_MODE)
    return data_dir

def make_log_dir(fsid, uid=None, gid=None):
    # type: (str, int, int) -> str
    if not uid or not gid:
        (uid, gid) = extract_uid_gid()
    log_dir = get_log_dir(fsid)
    makedirs(log_dir, uid, gid, LOG_DIR_MODE)
    return log_dir

def make_var_run(fsid, uid, gid):
    # type: (str, int, int) -> None
    call_throws(['install', '-d', '-m0770', '-o', str(uid), '-g', str(gid),
                 '/var/run/ceph/%s' % fsid])

def copy_tree(src, dst, uid=None, gid=None):
    # type: (List[str], str, int, int) -> None
    """
    Copy a directory tree from src to dst
    """
    if not uid or not gid:
        (uid, gid) = extract_uid_gid()

    for src_dir in src:
        dst_dir = dst
        if os.path.isdir(dst):
            dst_dir = os.path.join(dst, os.path.basename(src_dir))

        logger.debug('copy directory \'%s\' -> \'%s\'' % (src_dir, dst_dir))
        shutil.rmtree(dst_dir, ignore_errors=True)
        shutil.copytree(src_dir, dst_dir)  # dirs_exist_ok needs python 3.8

        for dirpath, dirnames, filenames in os.walk(dst_dir):
            logger.debug('chown %s:%s \'%s\'' % (uid, gid, dirpath))
            os.chown(dirpath, uid, gid)
            for filename in filenames:
                logger.debug('chown %s:%s \'%s\'' % (uid, gid, filename))
                os.chown(os.path.join(dirpath, filename), uid, gid)


def copy_files(src, dst, uid=None, gid=None):
    # type: (List[str], str, int, int) -> None
    """
    Copy files from src to dst
    """
    if not uid or not gid:
        (uid, gid) = extract_uid_gid()

    for src_file in src:
        dst_file = dst
        if os.path.isdir(dst):
            dst_file = os.path.join(dst, os.path.basename(src_file))

        logger.debug('copy file \'%s\' -> \'%s\'' % (src_file, dst_file))
        shutil.copyfile(src_file, dst_file)

        logger.debug('chown %s:%s \'%s\'' % (uid, gid, dst_file))
        os.chown(dst_file, uid, gid)

def move_files(src, dst, uid=None, gid=None):
    # type: (List[str], str, int, int) -> None
    """
    Move files from src to dst
    """
    if not uid or not gid:
        (uid, gid) = extract_uid_gid()

    for src_file in src:
        dst_file = dst
        if os.path.isdir(dst):
            dst_file = os.path.join(dst, os.path.basename(src_file))

        if os.path.islink(src_file):
            # shutil.move() in py2 does not handle symlinks correctly
            src_rl = os.readlink(src_file)
            logger.debug("symlink '%s' -> '%s'" % (dst_file, src_rl))
            os.symlink(src_rl, dst_file)
            os.unlink(src_file)
        else:
            logger.debug("move file '%s' -> '%s'" % (src_file, dst_file))
            shutil.move(src_file, dst_file)
            logger.debug('chown %s:%s \'%s\'' % (uid, gid, dst_file))
            os.chown(dst_file, uid, gid)

## copied from distutils ##
def find_executable(executable, path=None):
    """Tries to find 'executable' in the directories listed in 'path'.
    A string listing directories separated by 'os.pathsep'; defaults to
    os.environ['PATH'].  Returns the complete filename or None if not found.
    """
    _, ext = os.path.splitext(executable)
    if (sys.platform == 'win32') and (ext != '.exe'):
        executable = executable + '.exe'

    if os.path.isfile(executable):
        return executable

    if path is None:
        path = os.environ.get('PATH', None)
        if path is None:
            try:
                path = os.confstr("CS_PATH")
            except (AttributeError, ValueError):
                # os.confstr() or CS_PATH is not available
                path = os.defpath
        # bpo-35755: Don't use os.defpath if the PATH environment variable is
        # set to an empty string

    # PATH='' doesn't match, whereas PATH=':' looks in the current directory
    if not path:
        return None

    paths = path.split(os.pathsep)
    for p in paths:
        f = os.path.join(p, executable)
        if os.path.isfile(f):
            # the file exists, we have a shot at spawn working
            return f
    return None

def find_program(filename):
    # type: (str) -> str
    name = find_executable(filename)
    if name is None:
        raise ValueError('%s not found' % filename)
    return name

def get_unit_name(fsid, daemon_type, daemon_id=None):
    # type: (str, str, Optional[Union[int, str]]) -> str
    # accept either name or type + id
    if daemon_id is not None:
        return 'ceph-%s@%s.%s' % (fsid, daemon_type, daemon_id)
    else:
        return 'ceph-%s@%s' % (fsid, daemon_type)

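# e.g. (illustrative): get_unit_name('<fsid>', 'mon', 'a') -> 'ceph-<fsid>@mon.a',
# an instance of the ceph-<fsid>@.service template written by deploy_daemon_units().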
def check_unit(unit_name):
    # type: (str) -> Tuple[bool, str, bool]
    # NOTE: we ignore the exit code here because systemctl outputs
    # various exit codes based on the state of the service, but the
    # string result is more explicit (and sufficient).
    enabled = False
    installed = False
    try:
        out, err, code = call(['systemctl', 'is-enabled', unit_name],
                              verbose_on_failure=False)
        if code == 0:
            enabled = True
            installed = True
        elif "disabled" in out:
            installed = True
    except Exception as e:
        logger.warning('unable to run systemctl: %s' % e)
        enabled = False
        installed = False

    state = 'unknown'
    try:
        out, err, code = call(['systemctl', 'is-active', unit_name],
                              verbose_on_failure=False)
        out = out.strip()
        if out in ['active']:
            state = 'running'
        elif out in ['inactive']:
            state = 'stopped'
        elif out in ['failed', 'auto-restart']:
            state = 'error'
        else:
            state = 'unknown'
    except Exception as e:
        logger.warning('unable to run systemctl: %s' % e)
        state = 'unknown'
    return (enabled, state, installed)

def check_units(units, enabler=None):
    # type: (List[str], Optional[Packager]) -> bool
    for u in units:
        (enabled, state, installed) = check_unit(u)
        if enabled and state == 'running':
            logger.info('Unit %s is enabled and running' % u)
            return True
        if enabler is not None:
            if installed:
                logger.info('Enabling unit %s' % u)
                enabler.enable_service(u)
    return False

def get_legacy_config_fsid(cluster, legacy_dir=None):
    # type: (str, str) -> Optional[str]
    config_file = '/etc/ceph/%s.conf' % cluster
    if legacy_dir is not None:
        config_file = os.path.abspath(legacy_dir + config_file)

    if os.path.exists(config_file):
        config = read_config(config_file)
        if config.has_section('global') and config.has_option('global', 'fsid'):
            return config.get('global', 'fsid')
    return None

def get_legacy_daemon_fsid(cluster, daemon_type, daemon_id, legacy_dir=None):
    # type: (str, str, Union[int, str], str) -> Optional[str]
    fsid = None
    if daemon_type == 'osd':
        try:
            fsid_file = os.path.join(args.data_dir,
                                     daemon_type,
                                     'ceph-%s' % daemon_id,
                                     'ceph_fsid')
            if legacy_dir is not None:
                fsid_file = os.path.abspath(legacy_dir + fsid_file)
            with open(fsid_file, 'r') as f:
                fsid = f.read().strip()
        except IOError:
            pass
    if not fsid:
        fsid = get_legacy_config_fsid(cluster, legacy_dir=legacy_dir)
    return fsid

def get_daemon_args(fsid, daemon_type, daemon_id):
    # type: (str, str, Union[int, str]) -> List[str]
    r = list()  # type: List[str]

    if daemon_type in Ceph.daemons and daemon_type != 'crash':
        r += [
            '--setuser', 'ceph',
            '--setgroup', 'ceph',
            '--default-log-to-file=false',
            '--default-log-to-stderr=true',
            '--default-log-stderr-prefix="debug "',
        ]
        if daemon_type == 'mon':
            r += [
                '--default-mon-cluster-log-to-file=false',
                '--default-mon-cluster-log-to-stderr=true',
            ]
    elif daemon_type in Monitoring.components:
        metadata = Monitoring.components[daemon_type]
        r += metadata.get('args', list())
        if daemon_type == 'alertmanager':
            config = get_parm(args.config_json)
            peers = config.get('peers', list())  # type: ignore
            for peer in peers:
                r += ["--cluster.peer={}".format(peer)]
    elif daemon_type == NFSGanesha.daemon_type:
        r += NFSGanesha.daemon_args

    return r

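# e.g. for 'alertmanager' with a config-json of {"peers": ["host1:9094"]},
# get_daemon_args() yields the component's default args plus
# '--cluster.peer=host1:9094' (illustrative).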
def create_daemon_dirs(fsid, daemon_type, daemon_id, uid, gid,
                       config=None, keyring=None,
                       reconfig=False):
    # type: (str, str, Union[int, str], int, int, Optional[str], Optional[str], Optional[bool]) -> None
    data_dir = make_data_dir(fsid, daemon_type, daemon_id, uid=uid, gid=gid)
    make_log_dir(fsid, uid=uid, gid=gid)

    if config:
        config_path = os.path.join(data_dir, 'config')
        with open(config_path, 'w') as f:
            os.fchown(f.fileno(), uid, gid)
            os.fchmod(f.fileno(), 0o600)
            f.write(config)
    if keyring:
        keyring_path = os.path.join(data_dir, 'keyring')
        with open(keyring_path, 'w') as f:
            os.fchmod(f.fileno(), 0o600)
            os.fchown(f.fileno(), uid, gid)
            f.write(keyring)

    if daemon_type in Monitoring.components.keys():
        config = get_parm(args.config_json)  # type: ignore
        required_files = Monitoring.components[daemon_type].get('config-json-files', list())

        # Set up directories specific to the monitoring component
        config_dir = ''
        if daemon_type == 'prometheus':
            data_dir_root = get_data_dir(fsid, daemon_type, daemon_id)
            config_dir = 'etc/prometheus'
            makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
            makedirs(os.path.join(data_dir_root, config_dir, 'alerting'), uid, gid, 0o755)
            makedirs(os.path.join(data_dir_root, 'data'), uid, gid, 0o755)
        elif daemon_type == 'grafana':
            data_dir_root = get_data_dir(fsid, daemon_type, daemon_id)
            config_dir = 'etc/grafana'
            makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
            makedirs(os.path.join(data_dir_root, config_dir, 'certs'), uid, gid, 0o755)
            makedirs(os.path.join(data_dir_root, config_dir, 'provisioning/datasources'), uid, gid, 0o755)
            makedirs(os.path.join(data_dir_root, 'data'), uid, gid, 0o755)
        elif daemon_type == 'alertmanager':
            data_dir_root = get_data_dir(fsid, daemon_type, daemon_id)
            config_dir = 'etc/alertmanager'
            makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
            makedirs(os.path.join(data_dir_root, config_dir, 'data'), uid, gid, 0o755)

        # populate the config directory for the component from the config-json
        for fname in required_files:
            if 'files' in config:  # type: ignore
                if isinstance(config['files'][fname], list):  # type: ignore
                    content = '\n'.join(config['files'][fname])  # type: ignore
                else:
                    content = config['files'][fname]  # type: ignore

                with open(os.path.join(data_dir_root, config_dir, fname), 'w') as f:
                    os.fchown(f.fileno(), uid, gid)
                    os.fchmod(f.fileno(), 0o600)
                    f.write(content)

    if daemon_type == NFSGanesha.daemon_type:
        nfs_ganesha = NFSGanesha.init(fsid, daemon_id)
        nfs_ganesha.create_daemon_dirs(data_dir, uid, gid)

def get_parm(option):
    # type: (str) -> Dict[str, str]

    if not option:
        return dict()

    global cached_stdin
    if option == '-':
        if cached_stdin is not None:
            j = cached_stdin
        else:
            try:
                j = injected_stdin  # type: ignore
            except NameError:
                j = sys.stdin.read()
                cached_stdin = j
    else:
        # inline json string
        if option[0] == '{' and option[-1] == '}':
            j = option
        # json file
        elif os.path.exists(option):
            with open(option, 'r') as f:
                j = f.read()
        else:
            raise Error("Config file {} not found".format(option))

    try:
        js = json.loads(j)
    except ValueError as e:
        raise Error("Invalid JSON in {}: {}".format(option, e))
    else:
        return js

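# get_parm() accepts '-' (read JSON from stdin), an inline '{...}' JSON
# string, or a path to a JSON file. Illustrative:
#
#   get_parm('{"pool": "nfs-ganesha"}')  ->  {'pool': 'nfs-ganesha'}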
def get_config_and_keyring():
    # type: () -> Tuple[str, str]
    if 'config_json' in args and args.config_json:
        d = get_parm(args.config_json)
        config = d.get('config')
        keyring = d.get('keyring')

    if 'config' in args and args.config:
        with open(args.config, 'r') as f:
            config = f.read()

    if 'key' in args and args.key:
        keyring = '[%s]\n\tkey = %s\n' % (args.name, args.key)
    elif 'keyring' in args and args.keyring:
        with open(args.keyring, 'r') as f:
            keyring = f.read()

    if not config:
        raise Error('no config provided')
    elif not keyring:
        raise Error('no keyring provided')

    return (config, keyring)

def get_container_mounts(fsid, daemon_type, daemon_id,
                         no_config=False):
    # type: (str, str, Union[int, str, None], Optional[bool]) -> Dict[str, str]
    mounts = dict()

    if daemon_type in Ceph.daemons:
        if fsid:
            run_path = os.path.join('/var/run/ceph', fsid)
            if os.path.exists(run_path):
                mounts[run_path] = '/var/run/ceph:z'
            log_dir = get_log_dir(fsid)
            mounts[log_dir] = '/var/log/ceph:z'
            crash_dir = '/var/lib/ceph/%s/crash' % fsid
            if os.path.exists(crash_dir):
                mounts[crash_dir] = '/var/lib/ceph/crash:z'

    if daemon_type in Ceph.daemons and daemon_id:
        data_dir = get_data_dir(fsid, daemon_type, daemon_id)
        if daemon_type == 'rgw':
            cdata_dir = '/var/lib/ceph/radosgw/ceph-rgw.%s' % (daemon_id)
        else:
            cdata_dir = '/var/lib/ceph/%s/ceph-%s' % (daemon_type, daemon_id)
        if daemon_type != 'crash':
            mounts[data_dir] = cdata_dir + ':z'
        if not no_config:
            mounts[data_dir + '/config'] = '/etc/ceph/ceph.conf:z'
        if daemon_type == 'rbd-mirror' or daemon_type == 'crash':
            # these do not search for their keyrings in a data directory
            mounts[data_dir + '/keyring'] = '/etc/ceph/ceph.client.%s.%s.keyring' % (daemon_type, daemon_id)

    if daemon_type in ['mon', 'osd']:
        mounts['/dev'] = '/dev'  # FIXME: narrow this down?
        mounts['/run/udev'] = '/run/udev'
    if daemon_type == 'osd':
        mounts['/sys'] = '/sys'  # for numa.cc, pick_address, cgroups, ...
        mounts['/run/lvm'] = '/run/lvm'
        mounts['/run/lock/lvm'] = '/run/lock/lvm'

    if daemon_type in Monitoring.components and daemon_id:
        data_dir = get_data_dir(fsid, daemon_type, daemon_id)
        if daemon_type == 'prometheus':
            mounts[os.path.join(data_dir, 'etc/prometheus')] = '/etc/prometheus:Z'
            mounts[os.path.join(data_dir, 'data')] = '/prometheus:Z'
        elif daemon_type == 'node-exporter':
            mounts['/proc'] = '/host/proc:ro'
            mounts['/sys'] = '/host/sys:ro'
            mounts['/'] = '/rootfs:ro'
        elif daemon_type == "grafana":
            mounts[os.path.join(data_dir, 'etc/grafana/grafana.ini')] = '/etc/grafana/grafana.ini:Z'
            mounts[os.path.join(data_dir, 'etc/grafana/provisioning/datasources')] = '/etc/grafana/provisioning/datasources:Z'
            mounts[os.path.join(data_dir, 'etc/grafana/certs')] = '/etc/grafana/certs:Z'
        elif daemon_type == 'alertmanager':
            mounts[os.path.join(data_dir, 'etc/alertmanager')] = '/alertmanager:Z'

    if daemon_type == NFSGanesha.daemon_type:
        assert daemon_id
        data_dir = get_data_dir(fsid, daemon_type, daemon_id)
        mounts.update(NFSGanesha.get_container_mounts(data_dir))

    return mounts

def get_container(fsid, daemon_type, daemon_id,
                  privileged=False,
                  ptrace=False,
                  container_args=[]):
    # type: (str, str, Union[int, str], bool, bool, List[str]) -> CephContainer
    if daemon_type in ['mon', 'osd']:
        # mon and osd need privileged in order for libudev to query devices
        privileged = True
    if daemon_type == 'rgw':
        entrypoint = '/usr/bin/radosgw'
        name = 'client.rgw.%s' % daemon_id
    elif daemon_type == 'rbd-mirror':
        entrypoint = '/usr/bin/rbd-mirror'
        name = 'client.rbd-mirror.%s' % daemon_id
    elif daemon_type == 'crash':
        entrypoint = '/usr/bin/ceph-crash'
        name = 'client.crash.%s' % daemon_id
    elif daemon_type in ['mon', 'mgr', 'mds', 'osd']:
        entrypoint = '/usr/bin/ceph-' + daemon_type
        name = '%s.%s' % (daemon_type, daemon_id)
    elif daemon_type in Monitoring.components:
        entrypoint = ''
        name = ''
    elif daemon_type == NFSGanesha.daemon_type:
        entrypoint = NFSGanesha.entrypoint
        name = '%s.%s' % (daemon_type, daemon_id)
    else:
        entrypoint = ''
        name = ''

    ceph_args = []  # type: List[str]
    if daemon_type in Monitoring.components:
        uid, gid = extract_uid_gid_monitoring(daemon_type)
        m = Monitoring.components[daemon_type]  # type: ignore
        metadata = m.get('image', dict())  # type: ignore
        monitoring_args = [
            '--user',
            str(uid),
            # FIXME: disable cpu/memory limits for the time being (not supported
            # by ubuntu 18.04 kernel!)
            #'--cpus',
            #metadata.get('cpus', '2'),
            #'--memory',
            #metadata.get('memory', '4GB')
        ]
        container_args.extend(monitoring_args)
    elif daemon_type == 'crash':
        ceph_args = ['-n', name]
    elif daemon_type in Ceph.daemons:
        ceph_args = ['-n', name, '-f']

    envs = []  # type: List[str]
    if daemon_type == NFSGanesha.daemon_type:
        envs.extend(NFSGanesha.get_container_envs())

    return CephContainer(
        image=args.image,
        entrypoint=entrypoint,
        args=ceph_args + get_daemon_args(fsid, daemon_type, daemon_id),
        container_args=container_args,
        volume_mounts=get_container_mounts(fsid, daemon_type, daemon_id),
        cname='ceph-%s-%s.%s' % (fsid, daemon_type, daemon_id),
        envs=envs,
        privileged=privileged,
        ptrace=ptrace,
    )

def extract_uid_gid(img='', file_path='/var/lib/ceph'):
    # type: (str, str) -> Tuple[int, int]

    if not img:
        img = args.image

    out = CephContainer(
        image=img,
        entrypoint='stat',
        args=['-c', '%u %g', file_path]
    ).run()
    (uid, gid) = out.split(' ')
    return (int(uid), int(gid))

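# extract_uid_gid() runs `stat -c '%u %g' /var/lib/ceph` inside the image to
# learn which uid:gid the daemon files should be owned by (in upstream ceph
# images this is typically 167:167 for the ceph user; an editor's note).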
1538 def deploy_daemon(fsid, daemon_type, daemon_id, c, uid, gid,
1539 config=None, keyring=None,
1540 osd_fsid=None,
1541 reconfig=False):
1542 # type: (str, str, Union[int, str], CephContainer, int, int, Optional[str], Optional[str], Optional[str], Optional[bool]) -> None
1543 data_dir = get_data_dir(fsid, daemon_type, daemon_id)
1544 if reconfig and not os.path.exists(data_dir):
1545 raise Error('cannot reconfig, data path %s does not exist' % data_dir)
1546 if daemon_type == 'mon' and not os.path.exists(data_dir):
1547 assert config
1548 assert keyring
1549 # tmp keyring file
1550 tmp_keyring = write_tmp(keyring, uid, gid)
1551
1552 # tmp config file
1553 tmp_config = write_tmp(config, uid, gid)
1554
1555 # --mkfs
1556 create_daemon_dirs(fsid, daemon_type, daemon_id, uid, gid)
1557 mon_dir = get_data_dir(fsid, 'mon', daemon_id)
1558 log_dir = get_log_dir(fsid)
1559 out = CephContainer(
1560 image=args.image,
1561 entrypoint='/usr/bin/ceph-mon',
1562 args=['--mkfs',
1563 '-i', str(daemon_id),
1564 '--fsid', fsid,
1565 '-c', '/tmp/config',
1566 '--keyring', '/tmp/keyring',
1567 ] + get_daemon_args(fsid, 'mon', daemon_id),
1568 volume_mounts={
1569 log_dir: '/var/log/ceph:z',
1570 mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % (daemon_id),
1571 tmp_keyring.name: '/tmp/keyring:z',
1572 tmp_config.name: '/tmp/config:z',
1573 },
1574 ).run()
1575
1576 # write conf
1577 with open(mon_dir + '/config', 'w') as f:
1578 os.fchown(f.fileno(), uid, gid)
1579 os.fchmod(f.fileno(), 0o600)
1580 f.write(config)
1581 else:
1582 # dirs, conf, keyring
1583 create_daemon_dirs(
1584 fsid, daemon_type, daemon_id,
1585 uid, gid,
1586 config, keyring)
1587
1588 if not reconfig:
1589 deploy_daemon_units(fsid, uid, gid, daemon_type, daemon_id, c,
1590 osd_fsid=osd_fsid)
1591
1592 if not os.path.exists(data_dir + '/unit.created'):
1593 with open(data_dir + '/unit.created', 'w') as f:
1594 os.fchmod(f.fileno(), 0o600)
1595 os.fchown(f.fileno(), uid, gid)
1596 f.write('mtime is time the daemon deployment was created\n')
1597
1598 with open(data_dir + '/unit.configured', 'w') as f:
1599 f.write('mtime is time we were last configured\n')
1600 os.fchmod(f.fileno(), 0o600)
1601 os.fchown(f.fileno(), uid, gid)
1602
1603 update_firewalld(daemon_type)
1604
1605 if reconfig and daemon_type not in Ceph.daemons:
1606 # ceph daemons do not need a restart; others (presumably) do to pick
1607 # up the new config
1608 call_throws(['systemctl', 'reset-failed',
1609 get_unit_name(fsid, daemon_type, daemon_id)])
1610 call_throws(['systemctl', 'restart',
1611 get_unit_name(fsid, daemon_type, daemon_id)])
1612
1613 def deploy_daemon_units(fsid, uid, gid, daemon_type, daemon_id, c,
1614 enable=True, start=True,
1615 osd_fsid=None):
1616 # type: (str, int, int, str, Union[int, str], CephContainer, bool, bool, Optional[str]) -> None
1617 # cmd
1618 data_dir = get_data_dir(fsid, daemon_type, daemon_id)
1619 with open(data_dir + '/unit.run.new', 'w') as f:
1620 # pre-start cmd(s)
1621 if daemon_type == 'osd':
1622 # osds have a pre-start step
1623 assert osd_fsid
1624 prestart = CephContainer(
1625 image=args.image,
1626 entrypoint='/usr/sbin/ceph-volume',
1627 args=[
1628 'lvm', 'activate',
1629 str(daemon_id), osd_fsid,
1630 '--no-systemd'
1631 ],
1632 privileged=True,
1633 volume_mounts=get_container_mounts(fsid, daemon_type, daemon_id),
1634 cname='ceph-%s-%s.%s-activate' % (fsid, daemon_type, daemon_id),
1635 )
1636 f.write(' '.join(prestart.run_cmd()) + '\n')
1637 elif daemon_type == NFSGanesha.daemon_type:
1638 # add nfs to the rados grace db
1639 nfs_ganesha = NFSGanesha.init(fsid, daemon_id)
1640 prestart = nfs_ganesha.get_rados_grace_container('add')
1641 f.write(' '.join(prestart.run_cmd()) + '\n')
1642
1643 # container run command
1644 f.write(' '.join(c.run_cmd()) + '\n')
1645 os.fchmod(f.fileno(), 0o600)
1646 os.rename(data_dir + '/unit.run.new',
1647 data_dir + '/unit.run')
1648
1649 # post-stop command(s)
1650 with open(data_dir + '/unit.poststop.new', 'w') as f:
1651 if daemon_type == 'osd':
1652 assert osd_fsid
1653 poststop = CephContainer(
1654 image=args.image,
1655 entrypoint='/usr/sbin/ceph-volume',
1656 args=[
1657 'lvm', 'deactivate',
1658 str(daemon_id), osd_fsid,
1659 ],
1660 privileged=True,
1661 volume_mounts=get_container_mounts(fsid, daemon_type, daemon_id),
1662 cname='ceph-%s-%s.%s-deactivate' % (fsid, daemon_type,
1663 daemon_id),
1664 )
1665 f.write(' '.join(poststop.run_cmd()) + '\n')
1666 elif daemon_type == NFSGanesha.daemon_type:
1667 # remove nfs from the rados grace db
1668 nfs_ganesha = NFSGanesha.init(fsid, daemon_id)
1669 poststop = nfs_ganesha.get_rados_grace_container('remove')
1670 f.write(' '.join(poststop.run_cmd()) + '\n')
1671 os.fchmod(f.fileno(), 0o600)
1672 os.rename(data_dir + '/unit.poststop.new',
1673 data_dir + '/unit.poststop')
1674
1675 with open(data_dir + '/unit.image.new', 'w') as f:
1676 f.write(c.image + '\n')
1677 os.fchmod(f.fileno(), 0o600)
1678 os.rename(data_dir + '/unit.image.new',
1679 data_dir + '/unit.image')
1680
1681 # systemd
1682 install_base_units(fsid)
1683 unit = get_unit_file(fsid, uid, gid)
1684 unit_file = 'ceph-%s@.service' % (fsid)
1685 with open(args.unit_dir + '/' + unit_file + '.new', 'w') as f:
1686 f.write(unit)
1687 os.rename(args.unit_dir + '/' + unit_file + '.new',
1688 args.unit_dir + '/' + unit_file)
1689 call_throws(['systemctl', 'daemon-reload'])
1690
1691 unit_name = get_unit_name(fsid, daemon_type, daemon_id)
1692 call(['systemctl', 'stop', unit_name],
1693 verbose_on_failure=False)
1694 call(['systemctl', 'reset-failed', unit_name],
1695 verbose_on_failure=False)
1696 if enable:
1697 call_throws(['systemctl', 'enable', unit_name])
1698 if start:
1699 call_throws(['systemctl', 'start', unit_name])
1700
1701 def update_firewalld(daemon_type):
1702 # type: (str) -> None
1703 if args.skip_firewalld:
1704 return
1705 cmd = find_executable('firewall-cmd')
1706 if not cmd:
1707 logger.debug('firewalld does not appear to be present')
1708 return
1709 (enabled, state, _) = check_unit('firewalld.service')
1710 if not enabled:
1711 logger.debug('firewalld.service is not enabled')
1712 return
1713
1714 fw_services = []
1715 fw_ports = []
1716 if daemon_type == 'mon':
1717 fw_services.append('ceph-mon')
1718 elif daemon_type in ['mgr', 'mds', 'osd']:
1719 fw_services.append('ceph')
1720 if daemon_type == 'mgr':
1721 fw_ports.append(8080) # dashboard
1722 fw_ports.append(8443) # dashboard
1723 fw_ports.append(9283) # mgr/prometheus exporter
1724 elif daemon_type in Monitoring.port_map.keys():
1725 fw_ports.extend(Monitoring.port_map[daemon_type]) # prometheus etc
1726 elif daemon_type == NFSGanesha.daemon_type:
1727 fw_services.append('nfs')
1728
1729 for svc in fw_services:
1730 out, err, ret = call([cmd, '--permanent', '--query-service', svc])
1731 if ret:
1732 logger.info('Enabling firewalld service %s in current zone...' % svc)
1733 out, err, ret = call([cmd, '--permanent', '--add-service', svc])
1734 if ret:
1735 raise RuntimeError(
1736 'unable to add service %s to current zone: %s' % (svc, err))
1737 else:
1738 logger.debug('firewalld service %s is enabled in current zone' % svc)
1739 for port in fw_ports:
1740 tcp_port = str(port) + '/tcp'
1741 out, err, ret = call([cmd, '--permanent', '--query-port', tcp_port])
1742 if ret:
1743 logger.info('Enabling firewalld port %s in current zone...' % tcp_port)
1744 out, err, ret = call([cmd, '--permanent', '--add-port', tcp_port])
1745 if ret:
1746 raise RuntimeError('unable to add port %s to current zone: %s' %
1747 (tcp_port, err))
1748 else:
1749 logger.debug('firewalld port %s is enabled in current zone' % tcp_port)
1750 call_throws([cmd, '--reload'])
1751
1752 def install_base_units(fsid):
1753 # type: (str) -> None
1754 """
1755 Set up ceph.target and ceph-$fsid.target units.
1756 """
1757 # global unit
1758 existed = os.path.exists(args.unit_dir + '/ceph.target')
1759 with open(args.unit_dir + '/ceph.target.new', 'w') as f:
1760 f.write('[Unit]\n'
1761 'Description=All Ceph clusters and services\n'
1762 '\n'
1763 '[Install]\n'
1764 'WantedBy=multi-user.target\n')
1765 os.rename(args.unit_dir + '/ceph.target.new',
1766 args.unit_dir + '/ceph.target')
1767 if not existed:
1768 # we disable before enable in case a different ceph.target
1769 # (from the traditional package) is present; while newer
1770 # systemd is smart enough to disable the old
1771 # (/lib/systemd/...) and enable the new (/etc/systemd/...),
1772 # some older versions of systemd error out with EEXIST.
1773 call_throws(['systemctl', 'disable', 'ceph.target'])
1774 call_throws(['systemctl', 'enable', 'ceph.target'])
1775 call_throws(['systemctl', 'start', 'ceph.target'])
1776
1777 # cluster unit
1778 existed = os.path.exists(args.unit_dir + '/ceph-%s.target' % fsid)
1779 with open(args.unit_dir + '/ceph-%s.target.new' % fsid, 'w') as f:
1780 f.write('[Unit]\n'
1781 'Description=Ceph cluster {fsid}\n'
1782 'PartOf=ceph.target\n'
1783 'Before=ceph.target\n'
1784 '\n'
1785 '[Install]\n'
1786 'WantedBy=multi-user.target ceph.target\n'.format(
1787 fsid=fsid)
1788 )
1789 os.rename(args.unit_dir + '/ceph-%s.target.new' % fsid,
1790 args.unit_dir + '/ceph-%s.target' % fsid)
1791 if not existed:
1792 call_throws(['systemctl', 'enable', 'ceph-%s.target' % fsid])
1793 call_throws(['systemctl', 'start', 'ceph-%s.target' % fsid])
1794
1795 # logrotate for the cluster
1796 with open(args.logrotate_dir + '/ceph-%s' % fsid, 'w') as f:
1797 """
1798 This is a bit sloppy in that the killall/pkill will touch all ceph daemons
1799 in all containers, but I don't see an elegant way to send SIGHUP *just* to
1800 the daemons for this cluster. (1) systemd kill -s will get the signal to
1801 podman, but podman will exit. (2) podman kill will get the signal to the
1802 first child (bash), but that isn't the ceph daemon. This is simpler and
1803 should be harmless.
1804 """
1805 f.write("""# created by cephadm
1806 /var/log/ceph/%s/*.log {
1807 rotate 7
1808 daily
1809 compress
1810 sharedscripts
1811 postrotate
1812 killall -q -1 ceph-mon ceph-mgr ceph-mds ceph-osd ceph-fuse radosgw rbd-mirror || pkill -1 -x "ceph-mon|ceph-mgr|ceph-mds|ceph-osd|ceph-fuse|radosgw|rbd-mirror" || true
1813 endscript
1814 missingok
1815 notifempty
1816 su root root
1817 }
1818 """ % fsid)
1819
1820 def get_unit_file(fsid, uid, gid):
1821 # type: (str, int, int) -> str
1822 install_path = find_program('install')
1823 u = """# generated by cephadm
1824 [Unit]
1825 Description=Ceph %i for {fsid}
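# %i expands to the instance name of this template unit, e.g. mon.myhost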
1826
1827 # According to:
1828 # http://www.freedesktop.org/wiki/Software/systemd/NetworkTarget
1829 # these can be removed once ceph-mon can dynamically change its network
1830 # configuration.
1831 After=network-online.target local-fs.target time-sync.target
1832 Wants=network-online.target local-fs.target time-sync.target
1833
1834 PartOf=ceph-{fsid}.target
1835 Before=ceph-{fsid}.target
1836
1837 [Service]
1838 LimitNOFILE=1048576
1839 LimitNPROC=1048576
1840 EnvironmentFile=-/etc/environment
1841 ExecStartPre=-{container_path} rm ceph-{fsid}-%i
1842 ExecStartPre=-{install_path} -d -m0770 -o {uid} -g {gid} /var/run/ceph/{fsid}
1843 ExecStart=/bin/bash {data_dir}/{fsid}/%i/unit.run
1844 ExecStop=-{container_path} stop ceph-{fsid}-%i
1845 ExecStopPost=-/bin/bash {data_dir}/{fsid}/%i/unit.poststop
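# KillMode=none: the unit.run/unit.poststop scripts manage the container, so keep systemd from killing its processes directly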
1846 KillMode=none
1847 Restart=on-failure
1848 RestartSec=10s
1849 TimeoutStartSec=120
1850 TimeoutStopSec=15
1851 StartLimitInterval=30min
1852 StartLimitBurst=5
1853
1854 [Install]
1855 WantedBy=ceph-{fsid}.target
1856 """.format(
1857 container_path=container_path,
1858 install_path=install_path,
1859 fsid=fsid,
1860 uid=uid,
1861 gid=gid,
1862 data_dir=args.data_dir)
1863 return u
1864
1865 ##################################
1866
1867 class CephContainer:
1868 def __init__(self,
1869 image,
1870 entrypoint,
1871 args=None,
1872 volume_mounts=None,
1873 cname='',
1874 container_args=None,
1875 envs=None,
1876 privileged=False,
1877 ptrace=False):
1878 # type: (str, str, Optional[List[str]], Optional[Dict[str, str]], str, Optional[List[str]], Optional[List[str]], bool, bool) -> None
1879 self.image = image
1880 self.entrypoint = entrypoint
1881 self.args = args or []
1882 self.volume_mounts = volume_mounts or {}
1883 self.cname = cname
1884 self.container_args = container_args or []
1885 self.envs = envs
1886 self.privileged = privileged
1887 self.ptrace = ptrace
1888
1889 def run_cmd(self):
1890 # type: () -> List[str]
1891 vols = [] # type: List[str]
1892 envs = [] # type: List[str]
1893 cname = [] # type: List[str]
1894 entrypoint = [] # type: List[str]
1895 if self.entrypoint:
1896 entrypoint = ['--entrypoint', self.entrypoint]
1897
1898 priv = [] # type: List[str]
1899 if self.privileged:
1900 priv = ['--privileged',
1901 # let OSD etc read block devs that haven't been chowned
1902 '--group-add=disk']
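# SYS_PTRACE lets debugging tools (e.g. gdb, strace) attach to daemons inside the container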
1903 if self.ptrace:
1904 priv.append('--cap-add=SYS_PTRACE')
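# flatten [['-v', 'host:ctr'], ...] into a single argv list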
1905 vols = sum(
1906 [['-v', '%s:%s' % (host_dir, container_dir)]
1907 for host_dir, container_dir in self.volume_mounts.items()], [])
1908 envs = [
1909 '-e', 'CONTAINER_IMAGE=%s' % self.image,
1910 '-e', 'NODE_NAME=%s' % get_hostname(),
1911 ]
1912 if self.envs:
1913 for e in self.envs:
1914 envs.extend(['-e', e])
1915 cname = ['--name', self.cname] if self.cname else []
1916 return [
1917 str(container_path),
1918 'run',
1919 '--rm',
1920 '--net=host',
1921 ] + self.container_args + priv + \
1922 cname + envs + \
1923 vols + entrypoint + \
1924 [
1925 self.image
1926 ] + self.args # type: ignore
1927
1928 def shell_cmd(self, cmd):
1929 # type: (List[str]) -> List[str]
1930 priv = [] # type: List[str]
1931 if self.privileged:
1932 priv = ['--privileged',
1933 # let OSD etc read block devs that haven't been chowned
1934 '--group-add=disk']
1935 vols = [] # type: List[str]
1936 vols = sum(
1937 [['-v', '%s:%s' % (host_dir, container_dir)]
1938 for host_dir, container_dir in self.volume_mounts.items()], [])
1939 envs = [
1940 '-e', 'CONTAINER_IMAGE=%s' % self.image,
1941 '-e', 'NODE_NAME=%s' % get_hostname(),
1942 ]
1943 if self.envs:
1944 for e in self.envs:
1945 envs.extend(['-e', e])
1949 return [
1950 str(container_path),
1951 'run',
1952 '--rm',
1953 '--net=host',
1954 ] + self.container_args + priv + envs + vols + [
1955 '--entrypoint', cmd[0],
1956 self.image
1957 ] + cmd[1:]
1958
1959 def exec_cmd(self, cmd):
1960 # type: (List[str]) -> List[str]
1961 return [
1962 str(container_path),
1963 'exec',
1964 ] + self.container_args + [
1965 self.cname,
1966 ] + cmd
1967
1968 def run(self, timeout=DEFAULT_TIMEOUT):
1969 # type: (Optional[int]) -> str
1970 logger.debug(self.run_cmd())
1971 out, _, _ = call_throws(
1972 self.run_cmd(), desc=self.entrypoint, timeout=timeout)
1973 return out
1974
1975 ##################################
1976
1977 @infer_image
1978 def command_version():
1979 # type: () -> int
1980 out = CephContainer(args.image, 'ceph', ['--version']).run()
1981 print(out.strip())
1982 return 0
1983
1984 ##################################
1985
1986 @infer_image
1987 def command_pull():
1988 # type: () -> int
1989 logger.info('Pulling latest %s...' % args.image)
1990 call_throws([container_path, 'pull', args.image])
1991 return command_inspect_image()
1992
1993 ##################################
1994
1995 @infer_image
1996 def command_inspect_image():
1997 # type: () -> int
1998 out, err, ret = call_throws([
1999 container_path, 'inspect',
2000 '--format', '{{.Id}}',
2001 args.image])
2002 if ret:
2003 return errno.ENOENT
2004 image_id = normalize_container_id(out.strip())
2005 ver = CephContainer(args.image, 'ceph', ['--version']).run().strip()
2006 r = {
2007 'image_id': image_id,
2008 'ceph_version': ver,
2009 }
2010 print(json.dumps(r, indent=4, sort_keys=True))
2011 return 0
2012
2013 ##################################
2014
2015 @default_image
2016 def command_bootstrap():
2017 # type: () -> int
2018
2019 if not args.output_config:
2020 args.output_config = os.path.join(args.output_dir, 'ceph.conf')
2021 if not args.output_keyring:
2022 args.output_keyring = os.path.join(args.output_dir,
2023 'ceph.client.admin.keyring')
2024 if not args.output_pub_ssh_key:
2025 args.output_pub_ssh_key = os.path.join(args.output_dir, 'ceph.pub')
2026
2027 # verify output files
2028 for f in [args.output_config, args.output_keyring, args.output_pub_ssh_key]:
2029 if not args.allow_overwrite:
2030 if os.path.exists(f):
2031 raise Error('%s already exists; delete or pass '
2032 '--allow-overwrite to overwrite' % f)
2033 dirname = os.path.dirname(f)
2034 if dirname and not os.path.exists(dirname):
2035 raise Error('%s directory %s does not exist' % (f, dirname))
2036
2037 if not args.skip_prepare_host:
2038 command_prepare_host()
2039 else:
2040 logger.info('Skip prepare_host')
2041
2042 # initial vars
2043 fsid = args.fsid or make_fsid()
2044 hostname = get_hostname()
2045 if '.' in hostname and not args.allow_fqdn_hostname:
2046 raise Error('hostname is a fully qualified domain name (%s); either fix (e.g., "sudo hostname %s" or similar) or pass --allow-fqdn-hostname' % (hostname, hostname.split('.')[0]))
2047 mon_id = args.mon_id or hostname
2048 mgr_id = args.mgr_id or generate_service_id()
2049 logger.info('Cluster fsid: %s' % fsid)
2050
2051 l = FileLock(fsid)
2052 l.acquire()
2053
2054 # ip
2055 r = re.compile(r':(\d+)$')
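# matches a trailing :port, e.g. '10.1.2.3:6789' -> '6789'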
2056 base_ip = None
2057 if args.mon_ip:
2058 hasport = r.findall(args.mon_ip)
2059 if hasport:
2060 port = int(hasport[0])
2061 if port == 6789:
2062 addr_arg = '[v1:%s]' % args.mon_ip
2063 elif port == 3300:
2064 addr_arg = '[v2:%s]' % args.mon_ip
2065 else:
2066 logger.warning('Using msgr2 protocol for unrecognized port %d' %
2067 port)
2068 addr_arg = '[v2:%s]' % args.mon_ip
2069 base_ip = args.mon_ip[0:-(len(str(port)))-1]
2070 check_ip_port(base_ip, port)
2071 else:
2072 base_ip = args.mon_ip
2073 addr_arg = '[v2:%s:3300,v1:%s:6789]' % (args.mon_ip, args.mon_ip)
2074 check_ip_port(args.mon_ip, 3300)
2075 check_ip_port(args.mon_ip, 6789)
2076 elif args.mon_addrv:
2077 addr_arg = args.mon_addrv
2078 if addr_arg[0] != '[' or addr_arg[-1] != ']':
2079 raise Error('--mon-addrv value %s must use square brackets' %
2080 addr_arg)
2081 for addr in addr_arg[1:-1].split(','):
2082 hasport = r.findall(addr)
2083 if not hasport:
2084 raise Error('--mon-addrv value %s must include port number' %
2085 addr_arg)
2086 port = int(hasport[0])
2087 # strip off v1: or v2: prefix
2088 addr = re.sub(r'^\w+:', '', addr)
2089 base_ip = addr[0:-(len(str(port)))-1]
2090 check_ip_port(base_ip, port)
2091 else:
2092 raise Error('must specify --mon-ip or --mon-addrv')
2093 logger.debug('Base mon IP is %s, final addrv is %s' % (base_ip, addr_arg))
2094
2095 mon_network = None
2096 if not args.skip_mon_network:
2097 # make sure IP is configured locally, and then figure out the
2098 # CIDR network
2099 for net, ips in list_networks().items():
2100 if base_ip in ips:
2101 mon_network = net
2102 logger.info('Mon IP %s is in CIDR network %s' % (base_ip,
2103 mon_network))
2104 break
2105 if not mon_network:
2106 raise Error('Failed to infer CIDR network for mon ip %s; pass '
2107 '--skip-mon-network to configure it later' % base_ip)
2108
2109 # config
2110 cp = read_config(args.config)
2111 if not cp.has_section('global'):
2112 cp.add_section('global')
2113 cp.set('global', 'fsid', fsid)
2114 cp.set('global', 'mon host', addr_arg)
2115 cp.set('global', 'container_image', args.image)
2116 cpf = StringIO()
2117 cp.write(cpf)
2118 config = cpf.getvalue()
2119
2120 if not args.skip_pull:
2121 logger.info('Pulling latest %s container...' % args.image)
2122 call_throws([container_path, 'pull', args.image])
2123
2124 logger.info('Extracting ceph user uid/gid from container image...')
2125 (uid, gid) = extract_uid_gid()
2126
2127 # create some initial keys
2128 logger.info('Creating initial keys...')
2129 mon_key = CephContainer(
2130 image=args.image,
2131 entrypoint='/usr/bin/ceph-authtool',
2132 args=['--gen-print-key'],
2133 ).run().strip()
2134 admin_key = CephContainer(
2135 image=args.image,
2136 entrypoint='/usr/bin/ceph-authtool',
2137 args=['--gen-print-key'],
2138 ).run().strip()
2139 mgr_key = CephContainer(
2140 image=args.image,
2141 entrypoint='/usr/bin/ceph-authtool',
2142 args=['--gen-print-key'],
2143 ).run().strip()
2144
2145 keyring = ('[mon.]\n'
2146 '\tkey = %s\n'
2147 '\tcaps mon = allow *\n'
2148 '[client.admin]\n'
2149 '\tkey = %s\n'
2150 '\tcaps mon = allow *\n'
2151 '\tcaps mds = allow *\n'
2152 '\tcaps mgr = allow *\n'
2153 '\tcaps osd = allow *\n'
2154 '[mgr.%s]\n'
2155 '\tkey = %s\n'
2156 '\tcaps mon = profile mgr\n'
2157 '\tcaps mds = allow *\n'
2158 '\tcaps osd = allow *\n'
2159 % (mon_key, admin_key, mgr_id, mgr_key))
2160
2161 # tmp keyring file
2162 tmp_bootstrap_keyring = write_tmp(keyring, uid, gid)
2163
2164 # create initial monmap, tmp monmap file
2165 logger.info('Creating initial monmap...')
2166 tmp_monmap = write_tmp('', 0, 0)
2167 out = CephContainer(
2168 image=args.image,
2169 entrypoint='/usr/bin/monmaptool',
2170 args=['--create',
2171 '--clobber',
2172 '--fsid', fsid,
2173 '--addv', mon_id, addr_arg,
2174 '/tmp/monmap'
2175 ],
2176 volume_mounts={
2177 tmp_monmap.name: '/tmp/monmap:z',
2178 },
2179 ).run()
2180
2181 # pass monmap file to ceph user for use by ceph-mon --mkfs below
2182 os.fchown(tmp_monmap.fileno(), uid, gid)
2183
2184 # create mon
2185 logger.info('Creating mon...')
2186 create_daemon_dirs(fsid, 'mon', mon_id, uid, gid)
2187 mon_dir = get_data_dir(fsid, 'mon', mon_id)
2188 log_dir = get_log_dir(fsid)
2189 out = CephContainer(
2190 image=args.image,
2191 entrypoint='/usr/bin/ceph-mon',
2192 args=['--mkfs',
2193 '-i', mon_id,
2194 '--fsid', fsid,
2195 '-c', '/dev/null',
2196 '--monmap', '/tmp/monmap',
2197 '--keyring', '/tmp/keyring',
2198 ] + get_daemon_args(fsid, 'mon', mon_id),
2199 volume_mounts={
2200 log_dir: '/var/log/ceph:z',
2201 mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % (mon_id),
2202 tmp_bootstrap_keyring.name: '/tmp/keyring:z',
2203 tmp_monmap.name: '/tmp/monmap:z',
2204 },
2205 ).run()
2206
2207 with open(mon_dir + '/config', 'w') as f:
2208 os.fchown(f.fileno(), uid, gid)
2209 os.fchmod(f.fileno(), 0o600)
2210 f.write(config)
2211
2212 make_var_run(fsid, uid, gid)
2213 mon_c = get_container(fsid, 'mon', mon_id)
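# config and keyring are already in place from the --mkfs step and the config file written above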
2214 deploy_daemon(fsid, 'mon', mon_id, mon_c, uid, gid,
2215 config=None, keyring=None)
2216
2217 # client.admin key + config to issue various CLI commands
2218 tmp_admin_keyring = write_tmp('[client.admin]\n'
2219 '\tkey = ' + admin_key + '\n',
2220 uid, gid)
2221 tmp_config = write_tmp(config, uid, gid)
2222
2223 # a CLI helper to reduce our typing
2224 def cli(cmd, extra_mounts=None, timeout=DEFAULT_TIMEOUT):
2225 # type: (List[str], Optional[Dict[str, str]], Optional[int]) -> str
2226 mounts = {
2227 log_dir: '/var/log/ceph:z',
2228 tmp_admin_keyring.name: '/etc/ceph/ceph.client.admin.keyring:z',
2229 tmp_config.name: '/etc/ceph/ceph.conf:z',
2230 }
2231 for k, v in (extra_mounts or {}).items():
2232 mounts[k] = v
2233 timeout = timeout or args.timeout
2234 return CephContainer(
2235 image=args.image,
2236 entrypoint='/usr/bin/ceph',
2237 args=cmd,
2238 volume_mounts=mounts,
2239 ).run(timeout=timeout)
2240
2241 logger.info('Waiting for mon to start...')
2242 c = CephContainer(
2243 image=args.image,
2244 entrypoint='/usr/bin/ceph',
2245 args=[
2246 'status'],
2247 volume_mounts={
2248 mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % (mon_id),
2249 tmp_admin_keyring.name: '/etc/ceph/ceph.client.admin.keyring:z',
2250 tmp_config.name: '/etc/ceph/ceph.conf:z',
2251 },
2252 )
2253
2254 # wait for the service to become available
2255 def is_mon_available():
2256 # type: () -> bool
2257 timeout = args.timeout if args.timeout else 30  # seconds
2258 out, err, ret = call(c.run_cmd(),
2259 desc=c.entrypoint,
2260 timeout=timeout)
2261 return ret == 0
2262 is_available('mon', is_mon_available)
2263
2264 # assimilate and minimize config
2265 if not args.no_minimize_config:
2266 logger.info('Assimilating anything we can from ceph.conf...')
2267 cli([
2268 'config', 'assimilate-conf',
2269 '-i', '/var/lib/ceph/mon/ceph-%s/config' % mon_id
2270 ], {
2271 mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % mon_id
2272 })
2273 logger.info('Generating new minimal ceph.conf...')
2274 cli([
2275 'config', 'generate-minimal-conf',
2276 '-o', '/var/lib/ceph/mon/ceph-%s/config' % mon_id
2277 ], {
2278 mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % mon_id
2279 })
2280 # re-read our minimized config
2281 with open(mon_dir + '/config', 'r') as f:
2282 config = f.read()
2283 logger.info('Restarting the monitor...')
2284 call_throws([
2285 'systemctl',
2286 'restart',
2287 get_unit_name(fsid, 'mon', mon_id)
2288 ])
2289
2290 if mon_network:
2291 logger.info('Setting mon public_network...')
2292 cli(['config', 'set', 'mon', 'public_network', mon_network])
2293
2294 # create mgr
2295 logger.info('Creating mgr...')
2296 mgr_keyring = '[mgr.%s]\n\tkey = %s\n' % (mgr_id, mgr_key)
2297 mgr_c = get_container(fsid, 'mgr', mgr_id)
2298 deploy_daemon(fsid, 'mgr', mgr_id, mgr_c, uid, gid,
2299 config=config, keyring=mgr_keyring)
2300
2301 # output files
2302 with open(args.output_keyring, 'w') as f:
2303 os.fchmod(f.fileno(), 0o600)
2304 f.write('[client.admin]\n'
2305 '\tkey = ' + admin_key + '\n')
2306 logger.info('Wrote keyring to %s' % args.output_keyring)
2307
2308 with open(args.output_config, 'w') as f:
2309 f.write(config)
2310 logger.info('Wrote config to %s' % args.output_config)
2311
2312 # wait for the service to become available
2313 logger.info('Waiting for mgr to start...')
2314 def is_mgr_available():
2315 # type: () -> bool
2316 timeout = args.timeout if args.timeout else 30  # seconds
2317 out = cli(['status', '-f', 'json-pretty'], timeout=timeout)
2318 j = json.loads(out)
2319 return j.get('mgrmap', {}).get('available', False)
2320 is_available('mgr', is_mgr_available)
2321
2322 # wait for mgr to restart (after enabling a module)
2323 def wait_for_mgr_restart():
2324 # first get latest mgrmap epoch from the mon
2325 out = cli(['mgr', 'dump'])
2326 j = json.loads(out)
2327 epoch = j['epoch']
2328 # wait for mgr to have it
2329 logger.info('Waiting for the mgr to restart...')
2330 def mgr_has_latest_epoch():
2331 # type: () -> bool
2332 try:
2333 out = cli(['tell', 'mgr', 'mgr_status'])
2334 j = json.loads(out)
2335 return j['mgrmap_epoch'] >= epoch
2336 except Exception as e:
2337 logger.debug('tell mgr mgr_status failed: %s' % e)
2338 return False
2339 is_available('Mgr epoch %d' % epoch, mgr_has_latest_epoch)
2340
2341 # ssh
2342 if not args.skip_ssh:
2343 logger.info('Enabling cephadm module...')
2344 cli(['mgr', 'module', 'enable', 'cephadm'])
2345 wait_for_mgr_restart()
2346
2347 logger.info('Setting orchestrator backend to cephadm...')
2348 cli(['orch', 'set', 'backend', 'cephadm'])
2349
2350 logger.info('Generating ssh key...')
2351 cli(['cephadm', 'generate-key'])
2352 ssh_pub = cli(['cephadm', 'get-pub-key'])
2353
2354 with open(args.output_pub_ssh_key, 'w') as f:
2355 f.write(ssh_pub)
2356 logger.info('Wrote public SSH key to %s' % args.output_pub_ssh_key)
2357
2358 logger.info('Adding key to root@localhost\'s authorized_keys...')
2359 if not os.path.exists('/root/.ssh'):
2360 os.mkdir('/root/.ssh', 0o700)
2361 auth_keys_file = '/root/.ssh/authorized_keys'
2362 add_newline = False
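# only append a separating newline if the existing file does not already end with one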
2363 if os.path.exists(auth_keys_file):
2364 with open(auth_keys_file, 'r') as f:
2365 f.seek(0, os.SEEK_END)
2366 if f.tell() > 0:
2367 f.seek(f.tell()-1, os.SEEK_SET) # go to last char
2368 if f.read() != '\n':
2369 add_newline = True
2370 with open(auth_keys_file, 'a') as f:
2371 os.fchmod(f.fileno(), 0o600) # just in case we created it
2372 if add_newline:
2373 f.write('\n')
2374 f.write(ssh_pub.strip() + '\n')
2375
2376 host = get_hostname()
2377 logger.info('Adding host %s...' % host)
2378 cli(['orch', 'host', 'add', host])
2379
2380 if not args.orphan_initial_daemons:
2381 for t in ['mon', 'mgr', 'crash']:
2382 logger.info('Deploying %s service with default placement...' % t)
2383 cli(['orch', 'apply', t])
2384
2385 if not args.skip_monitoring_stack:
2386 logger.info('Enabling mgr prometheus module...')
2387 cli(['mgr', 'module', 'enable', 'prometheus'])
2388 for t in ['prometheus', 'grafana', 'node-exporter', 'alertmanager']:
2389 logger.info('Deploying %s service with default placement...' % t)
2390 cli(['orch', 'apply', t])
2391
2392 if not args.skip_dashboard:
2393 logger.info('Enabling the dashboard module...')
2394 cli(['mgr', 'module', 'enable', 'dashboard'])
2395 wait_for_mgr_restart()
2396
2397 # dashboard crt and key
2398 if args.dashboard_key and args.dashboard_crt:
2399 logger.info('Using provided dashboard certificate...')
2400 mounts = {}
2401 mounts[pathify(args.dashboard_crt)] = '/tmp/dashboard.crt:z'
2402 mounts[pathify(args.dashboard_key)] = '/tmp/dashboard.key:z'
2403 cli(['dashboard', 'set-ssl-certificate', '-i', '/tmp/dashboard.crt'], extra_mounts=mounts)
2404 cli(['dashboard', 'set-ssl-certificate-key', '-i', '/tmp/dashboard.key'], extra_mounts=mounts)
2405 else:
2406 logger.info('Generating a dashboard self-signed certificate...')
2407 cli(['dashboard', 'create-self-signed-cert'])
2408
2409 logger.info('Creating initial admin user...')
2410 password = args.initial_dashboard_password or generate_password()
2411 cmd = ['dashboard', 'ac-user-create', args.initial_dashboard_user, password, 'administrator', '--force-password']
2412 if not args.dashboard_password_noupdate:
2413 cmd.append('--pwd-update-required')
2414 cli(cmd)
2415 logger.info('Fetching dashboard port number...')
2416 out = cli(['config', 'get', 'mgr', 'mgr/dashboard/ssl_server_port'])
2417 port = int(out)
2418
2419 logger.info('Ceph Dashboard is now available at:\n\n'
2420 '\t URL: https://%s:%s/\n'
2421 '\t User: %s\n'
2422 '\tPassword: %s\n' % (
2423 get_fqdn(), port,
2424 args.initial_dashboard_user,
2425 password))
2426
2427 logger.info('You can access the Ceph CLI with:\n\n'
2428 '\tsudo %s shell --fsid %s -c %s -k %s\n' % (
2429 sys.argv[0],
2430 fsid,
2431 args.output_config,
2432 args.output_keyring))
2433 logger.info('Please consider enabling telemetry to help improve Ceph:\n\n'
2434 '\tceph telemetry on\n\n'
2435 'For more information see:\n\n'
2436 '\thttps://docs.ceph.com/docs/master/mgr/telemetry/\n')
2437 logger.info('Bootstrap complete.')
2438 return 0
2439
2440 ##################################
2441
2442 def extract_uid_gid_monitoring(daemon_type):
2443 # type: (str) -> Tuple[int, int]
2444
2445 if daemon_type == 'prometheus':
2446 uid, gid = extract_uid_gid(file_path='/etc/prometheus')
2447 elif daemon_type == 'node-exporter':
2448 uid, gid = 65534, 65534
2449 elif daemon_type == 'grafana':
2450 uid, gid = extract_uid_gid(file_path='/var/lib/grafana')
2451 elif daemon_type == 'alertmanager':
2452 uid, gid = extract_uid_gid(file_path='/etc/alertmanager')
2453 else:
2454 raise Error("{} not implemented yet".format(daemon_type))
2455 return uid, gid
2456
2457
2458 @default_image
2459 def command_deploy():
2460 # type: () -> None
2461 (daemon_type, daemon_id) = args.name.split('.', 1)
2462
2463 l = FileLock(args.fsid)
2464 l.acquire()
2465
2466 if daemon_type not in get_supported_daemons():
2467 raise Error('daemon type %s not recognized' % daemon_type)
2468
2469 logger.info('Deploying daemon %s.%s ...' % (daemon_type, daemon_id))
2470
2471 if daemon_type in Ceph.daemons:
2472 (config, keyring) = get_config_and_keyring()
2473 (uid, gid) = extract_uid_gid()
2474 make_var_run(args.fsid, uid, gid)
2475 c = get_container(args.fsid, daemon_type, daemon_id,
2476 ptrace=args.allow_ptrace)
2477 deploy_daemon(args.fsid, daemon_type, daemon_id, c, uid, gid,
2478 config=config, keyring=keyring,
2479 osd_fsid=args.osd_fsid,
2480 reconfig=args.reconfig)
2481
2482 elif daemon_type in Monitoring.components:
2483 # monitoring daemon - prometheus, grafana, alertmanager, node-exporter
2484 monitoring_args = [] # type: List[str]
2485
2486 # Default Checks
2487 if not args.reconfig:
2488 daemon_ports = Monitoring.port_map[daemon_type] # type: List[int]
2489 if any([port_in_use(port) for port in daemon_ports]):
2490 raise Error("TCP Port(s) '{}' required for {} is already in use".format(",".join(map(str, daemon_ports)), daemon_type))
2491
2492 # make sure provided config-json is sufficient
2493 config = get_parm(args.config_json) # type: ignore
2494 required_files = Monitoring.components[daemon_type].get('config-json-files', list())
2495 required_args = Monitoring.components[daemon_type].get('config-json-args', list())
2496 if required_files:
2497 if not config or not all(c in config.get('files', {}).keys() for c in required_files): # type: ignore
2498 raise Error("{} deployment requires config-json which must "
2499 "contain file content for {}".format(daemon_type.capitalize(), ', '.join(required_files)))
2500 if required_args:
2501 if not config or not all(c in config.keys() for c in required_args): # type: ignore
2502 raise Error("{} deployment requires config-json which must "
2503 "contain arg for {}".format(daemon_type.capitalize(), ', '.join(required_args)))
2504
2505
2506 uid, gid = extract_uid_gid_monitoring(daemon_type)
2507 c = get_container(args.fsid, daemon_type, daemon_id)
2508 deploy_daemon(args.fsid, daemon_type, daemon_id, c, uid, gid,
2509 reconfig=args.reconfig)
2510
2511 elif daemon_type == NFSGanesha.daemon_type:
2512 NFSGanesha.port_in_use()
2513 (config, keyring) = get_config_and_keyring()
2514 # TODO: extract ganesha uid/gid (997, 994) ?
2515 (uid, gid) = extract_uid_gid()
2516 c = get_container(args.fsid, daemon_type, daemon_id)
2517 deploy_daemon(args.fsid, daemon_type, daemon_id, c, uid, gid,
2518 config=config, keyring=keyring,
2519 reconfig=args.reconfig)
2520 else:
2521 raise Error("{} not implemented in command_deploy function".format(daemon_type))
2522
2523 ##################################
2524
2525 @infer_image
2526 def command_run():
2527 # type: () -> int
2528 (daemon_type, daemon_id) = args.name.split('.', 1)
2529 c = get_container(args.fsid, daemon_type, daemon_id)
2530 command = c.run_cmd()
2531 return call_timeout(command, args.timeout)
2532
2533 ##################################
2534
2535 @infer_fsid
2536 @infer_image
2537 def command_shell():
2538 # type: () -> int
2539 if args.fsid:
2540 make_log_dir(args.fsid)
2541 if args.name:
2542 if '.' in args.name:
2543 (daemon_type, daemon_id) = args.name.split('.', 1)
2544 else:
2545 daemon_type = args.name
2546 daemon_id = None
2547 else:
2548 daemon_type = 'osd' # get the most mounts
2549 daemon_id = None
2550
2551 if daemon_id and not args.fsid:
2552 raise Error('must pass --fsid to specify cluster')
2553
2554 # use /etc/ceph files by default, if present. we do this instead of
2555 # making these defaults in the arg parser because we don't want an error
2556 # if they don't exist.
2557 if not args.config and os.path.exists(SHELL_DEFAULT_CONF):
2558 args.config = SHELL_DEFAULT_CONF
2559 if not args.keyring and os.path.exists(SHELL_DEFAULT_KEYRING):
2560 args.keyring = SHELL_DEFAULT_KEYRING
2561
2562 container_args = [] # type: List[str]
2563 mounts = get_container_mounts(args.fsid, daemon_type, daemon_id,
2564 no_config=bool(args.config))
2565 if args.config:
2566 mounts[pathify(args.config)] = '/etc/ceph/ceph.conf:z'
2567 if args.keyring:
2568 mounts[pathify(args.keyring)] = '/etc/ceph/ceph.keyring:z'
2569 if args.command:
2570 command = args.command
2571 else:
2572 command = ['bash']
2573 container_args += [
2574 '-it',
2575 '-e', 'LANG=C',
2576 '-e', "PS1=%s" % CUSTOM_PS1,
2577 ]
2578 if args.fsid:
2579 home = os.path.join(args.data_dir, args.fsid, 'home')
2580 if not os.path.exists(home):
2581 logger.debug('Creating root home at %s' % home)
2582 makedirs(home, 0, 0, 0o660)
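# seed the container home with the host's bash skeleton dotfiles so the shell behaves familiarly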
2583 if os.path.exists('/etc/skel'):
2584 for f in os.listdir('/etc/skel'):
2585 if f.startswith('.bash'):
2586 shutil.copyfile(os.path.join('/etc/skel', f),
2587 os.path.join(home, f))
2588 mounts[home] = '/root'
2589
2590 c = CephContainer(
2591 image=args.image,
2592 entrypoint='doesnotmatter',
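# placeholder: shell_cmd() below overrides the entrypoint with the requested command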
2593 args=[],
2594 container_args=container_args,
2595 volume_mounts=mounts,
2596 envs=args.env,
2597 privileged=True)
2598 command = c.shell_cmd(command)
2599
2600 return call_timeout(command, args.timeout)
2601
2602 ##################################
2603
2604 @infer_fsid
2605 def command_enter():
2606 # type: () -> int
2607 if not args.fsid:
2608 raise Error('must pass --fsid to specify cluster')
2609 (daemon_type, daemon_id) = args.name.split('.', 1)
2610 container_args = [] # type: List[str]
2611 if args.command:
2612 command = args.command
2613 else:
2614 command = ['sh']
2615 container_args += [
2616 '-it',
2617 '-e', 'LANG=C',
2618 '-e', "PS1=%s" % CUSTOM_PS1,
2619 ]
2620 c = get_container(args.fsid, daemon_type, daemon_id,
2621 container_args=container_args)
2622 command = c.exec_cmd(command)
2623 return call_timeout(command, args.timeout)
2624
2625 ##################################
2626
2627 @infer_fsid
2628 @infer_image
2629 def command_ceph_volume():
2630 # type: () -> None
2631 if args.fsid:
2632 make_log_dir(args.fsid)
2633
2634 (uid, gid) = (0, 0) # ceph-volume runs as root
2635 mounts = get_container_mounts(args.fsid, 'osd', None)
2636
2637 tmp_config = None
2638 tmp_keyring = None
2639
2640 if args.config_json:
2641 # note: this will always pull from args.config_json (we
2642 # require it) and never args.config or args.keyring.
2643 (config, keyring) = get_config_and_keyring()
2644
2645 # tmp keyring file
2646 tmp_keyring = write_tmp(keyring, uid, gid)
2647
2648 # tmp config file
2649 tmp_config = write_tmp(config, uid, gid)
2650
2651 mounts[tmp_config.name] = '/etc/ceph/ceph.conf:z'
2652 mounts[tmp_keyring.name] = '/var/lib/ceph/bootstrap-osd/ceph.keyring:z'
2653
2654 c = CephContainer(
2655 image=args.image,
2656 entrypoint='/usr/sbin/ceph-volume',
2657 args=args.command,
2658 privileged=True,
2659 volume_mounts=mounts,
2660 )
2661 out, err, code = call_throws(c.run_cmd(), verbose=True)
2662 if not code:
2663 print(out)
2664
2665 ##################################
2666
2667 @infer_fsid
2668 def command_unit():
2669 # type: () -> None
2670 if not args.fsid:
2671 raise Error('must pass --fsid to specify cluster')
2672 (daemon_type, daemon_id) = args.name.split('.', 1)
2673 unit_name = get_unit_name(args.fsid, daemon_type, daemon_id)
2674 call_throws([
2675 'systemctl',
2676 args.command,
2677 unit_name])
2678
2679 ##################################
2680
2681 @infer_fsid
2682 def command_logs():
2683 # type: () -> None
2684 if not args.fsid:
2685 raise Error('must pass --fsid to specify cluster')
2686
2687 (daemon_type, daemon_id) = args.name.split('.', 1)
2688 unit_name = get_unit_name(args.fsid, daemon_type, daemon_id)
2689
2690 cmd = [find_program('journalctl')]
2691 cmd.extend(['-u', unit_name])
2692 if args.command:
2693 cmd.extend(args.command)
2694
2695 # call this directly, without our wrapper, so that we get an unmolested
2696 # stdout free of logger prefixing.
2697 logger.debug("Running command: %s" % ' '.join(cmd))
2698 subprocess.call(cmd) # type: ignore
2699
2700 ##################################
2701
2702 def list_networks():
2703 # type: () -> Dict[str,List[str]]
2704
2705 ## sadly, 18.04's iproute2 4.15.0-2ubun doesn't support the -j flag,
2706 ## so we'll need to use a regex to parse 'ip' command output.
2707 #out, _, _ = call_throws(['ip', '-j', 'route', 'ls'])
2708 #j = json.loads(out)
2709 #for x in j:
2710
2711 out, _, _ = call_throws([find_executable('ip'), 'route', 'ls'])
2712 return _parse_ip_route(out)
2713
2714 def _parse_ip_route(out):
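# type: (str) -> Dict[str,List[str]]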
2715 r = {} # type: Dict[str,List[str]]
2716 p = re.compile(r'^(\S+) (.*)scope link (.*)src (\S+)')
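# e.g. '10.1.2.0/24 dev eth0 proto kernel scope link src 10.1.2.3' -> {'10.1.2.0/24': ['10.1.2.3']}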
2717 for line in out.splitlines():
2718 m = p.findall(line)
2719 if not m:
2720 continue
2721 net = m[0][0]
2722 ip = m[0][3]
2723 if net not in r:
2724 r[net] = []
2725 r[net].append(ip)
2726 return r
2727
2728 def command_list_networks():
2729 # type: () -> None
2730 r = list_networks()
2731 print(json.dumps(r, indent=4))
2732
2733 ##################################
2734
2735 def command_ls():
2736 # type: () -> None
2737 ls = list_daemons(detail=not args.no_detail,
2738 legacy_dir=args.legacy_dir)
2739 print(json.dumps(ls, indent=4))
2740
2741 def list_daemons(detail=True, legacy_dir=None):
2742 # type: (bool, Optional[str]) -> List[Dict[str, str]]
2743 host_version = None
2744 ls = []
2745
2746 data_dir = args.data_dir
2747 if legacy_dir is not None:
2748 data_dir = os.path.abspath(legacy_dir + data_dir)
2749
2750 # keep track of ceph versions we see
2751 seen_versions = {} # type: Dict[str, Optional[str]]
2752
2753 # /var/lib/ceph
2754 if os.path.exists(data_dir):
2755 for i in os.listdir(data_dir):
2756 if i in ['mon', 'osd', 'mds', 'mgr']:
2757 daemon_type = i
2758 for j in os.listdir(os.path.join(data_dir, i)):
2759 if '-' not in j:
2760 continue
2761 (cluster, daemon_id) = j.split('-', 1)
2762 fsid = get_legacy_daemon_fsid(
2763 cluster, daemon_type, daemon_id,
2764 legacy_dir=legacy_dir)
2765 i = {
2766 'style': 'legacy',
2767 'name': '%s.%s' % (daemon_type, daemon_id),
2768 'fsid': fsid if fsid is not None else 'unknown',
2769 }
2770 if detail:
2771 (i['enabled'], i['state'], _) = check_unit(
2772 'ceph-%s@%s' % (daemon_type, daemon_id))
2773 if not host_version:
2774 try:
2775 out, err, code = call(['ceph', '-v'])
2776 if not code and out.startswith('ceph version '):
2777 host_version = out.split(' ')[2]
2778 except Exception:
2779 pass
2780 i['host_version'] = host_version
2781 ls.append(i)
2782 elif is_fsid(i):
2783 fsid = str(i) # convince mypy that fsid is a str here
2784 for j in os.listdir(os.path.join(data_dir, i)):
2785 if '.' in j:
2786 name = j
2787 (daemon_type, daemon_id) = j.split('.', 1)
2788 unit_name = get_unit_name(fsid,
2789 daemon_type,
2790 daemon_id)
2791 else:
2792 continue
2793 i = {
2794 'style': 'cephadm:v1',
2795 'name': name,
2796 'fsid': fsid,
2797 }
2798 if detail:
2799 # get container id
2800 (i['enabled'], i['state'], _) = check_unit(unit_name)
2801 container_id = None
2802 image_name = None
2803 image_id = None
2804 version = None
2805 start_stamp = None
2806
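# podman < 1.6.2 exposes the image id as .ImageID; newer versions (and docker) use .Image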
2807 if 'podman' in container_path and get_podman_version() < (1, 6, 2):
2808 image_field = '.ImageID'
2809 else:
2810 image_field = '.Image'
2811
2812 out, err, code = call(
2813 [
2814 container_path, 'inspect',
2815 '--format', '{{.Id}},{{.Config.Image}},{{%s}},{{.Created}},{{index .Config.Labels "io.ceph.version"}}' % image_field,
2816 'ceph-%s-%s' % (fsid, j)
2817 ],
2818 verbose_on_failure=False)
2819 if not code:
2820 (container_id, image_name, image_id, start,
2821 version) = out.strip().split(',')
2822 image_id = normalize_container_id(image_id)
2823 daemon_type = name.split('.', 1)[0]
2824 start_stamp = try_convert_datetime(start)
2825 if not version or '.' not in version:
2826 version = seen_versions.get(image_id, None)
2827 if daemon_type == NFSGanesha.daemon_type:
2828 version = NFSGanesha.get_version(container_id)
2829 elif not version:
2830 if daemon_type in Ceph.daemons:
2831 out, err, code = call(
2832 [container_path, 'exec', container_id,
2833 'ceph', '-v'])
2834 if not code and \
2835 out.startswith('ceph version '):
2836 version = out.split(' ')[2]
2837 seen_versions[image_id] = version
2838 elif daemon_type == 'grafana':
2839 out, err, code = call(
2840 [container_path, 'exec', container_id,
2841 'grafana-server', '-v'])
2842 if not code and \
2843 out.startswith('Version '):
2844 version = out.split(' ')[1]
2845 seen_versions[image_id] = version
2846 elif daemon_type in ['prometheus',
2847 'alertmanager',
2848 'node-exporter']:
2849 cmd = daemon_type.replace('-', '_')
2850 out, err, code = call(
2851 [container_path, 'exec', container_id,
2852 cmd, '--version'])
2853 if not code and \
2854 err.startswith('%s, version ' % cmd):
2855 version = err.split(' ')[2]
2856 seen_versions[image_id] = version
2857 else:
2858 logger.warning('could not determine version for unknown daemon type %s' % daemon_type)
2859 else:
2860 vfile = os.path.join(data_dir, fsid, j, 'unit.image') # type: ignore
2861 try:
2862 with open(vfile, 'r') as f:
2863 image_name = f.read().strip() or None
2864 except IOError:
2865 pass
2866 i['container_id'] = container_id
2867 i['container_image_name'] = image_name
2868 i['container_image_id'] = image_id
2869 i['version'] = version
2870 i['started'] = start_stamp
2871 i['created'] = get_file_timestamp(
2872 os.path.join(data_dir, fsid, j, 'unit.created')
2873 )
2874 i['deployed'] = get_file_timestamp(
2875 os.path.join(data_dir, fsid, j, 'unit.image'))
2876 i['configured'] = get_file_timestamp(
2877 os.path.join(data_dir, fsid, j, 'unit.configured'))
2878
2879 ls.append(i)
2880
2881 # /var/lib/rook
2882 # WRITE ME
2883 return ls
2884
2885
2886 ##################################
2887
2888 @default_image
2889 def command_adopt():
2890 # type: () -> None
2891
2892 if not args.skip_pull:
2893 logger.info('Pulling latest %s container...' % args.image)
2894 call_throws([container_path, 'pull', args.image])
2895
2896 (daemon_type, daemon_id) = args.name.split('.', 1)
2897
2898 # legacy check
2899 if args.style != 'legacy':
2900 raise Error('adoption of style %s not implemented' % args.style)
2901
2902 # lock
2903 fsid = get_legacy_daemon_fsid(args.cluster,
2904 daemon_type,
2905 daemon_id,
2906 legacy_dir=args.legacy_dir)
2907 if not fsid:
2908 raise Error('could not detect legacy fsid; set fsid in ceph.conf')
2909 l = FileLock(fsid)
2910 l.acquire()
2911
2912 # call correct adoption
2913 if daemon_type in Ceph.daemons:
2914 command_adopt_ceph(daemon_type, daemon_id, fsid)
2915 elif daemon_type == 'prometheus':
2916 command_adopt_prometheus(daemon_id, fsid)
2917 elif daemon_type == 'grafana':
2918 command_adopt_grafana(daemon_id, fsid)
2919 elif daemon_type == 'node-exporter':
2920 raise Error('adoption of node-exporter not implemented')
2921 elif daemon_type == 'alertmanager':
2922 raise Error('adoption of alertmanager not implemented')
2923 else:
2924 raise Error('daemon type %s not recognized' % daemon_type)
2925
2926
2927
2928 def command_adopt_ceph(daemon_type, daemon_id, fsid):
2929 # type: (str, str, str) -> None
2930
2931 (uid, gid) = extract_uid_gid()
2932
2933 data_dir_src = ('/var/lib/ceph/%s/%s-%s' %
2934 (daemon_type, args.cluster, daemon_id))
2935 data_dir_src = os.path.abspath(args.legacy_dir + data_dir_src)
2936
2937 osd_fsid = None
2938 if daemon_type == 'osd':
2939 path = os.path.join(data_dir_src, 'fsid')
2940 try:
2941 with open(path, 'r') as f:
2942 osd_fsid = f.read().strip()
2943 except IOError:
2944 raise Error('unable to read OSD fsid from %s' % path)
2945 os_type = None
2946 if os.path.exists(os.path.join(data_dir_src, 'type')):
2947 with open(os.path.join(data_dir_src, 'type')) as f:
2948 os_type = f.read().strip()
2949 else:
2950 raise Error('"type" file missing for OSD data dir')
2951 logger.info('objectstore_type is %s' % os_type)
2952 if os_type == 'filestore':
2953 raise Error('FileStore is not supported by cephadm')
2954
2955 # NOTE: implicit assumption here that the units correspond to the
2956 # cluster we are adopting based on the /etc/{defaults,sysconfig}/ceph
2957 # CLUSTER field.
2958 unit_name = 'ceph-%s@%s' % (daemon_type, daemon_id)
2959 (enabled, state, _) = check_unit(unit_name)
2960 if state == 'running':
2961 logger.info('Stopping old systemd unit %s...' % unit_name)
2962 call_throws(['systemctl', 'stop', unit_name])
2963 if enabled:
2964 logger.info('Disabling old systemd unit %s...' % unit_name)
2965 call_throws(['systemctl', 'disable', unit_name])
2966
2967 # data
2968 logger.info('Moving data...')
2969 data_dir_dst = make_data_dir(fsid, daemon_type, daemon_id,
2970 uid=uid, gid=gid)
2971 move_files(glob(os.path.join(data_dir_src, '*')),
2972 data_dir_dst,
2973 uid=uid, gid=gid)
2974 logger.debug('Remove dir \'%s\'' % (data_dir_src))
2975 if os.path.ismount(data_dir_src):
2976 call_throws(['umount', data_dir_src])
2977 os.rmdir(data_dir_src)
2978
2979 logger.info('Chowning content...')
2980 call_throws(['chown', '-c', '-R', '%d.%d' % (uid, gid), data_dir_dst])
2981
2982 if daemon_type == 'mon':
2983 # rename *.ldb -> *.sst, in case they are coming from ubuntu
2984 store = os.path.join(data_dir_dst, 'store.db')
2985 num_renamed = 0
2986 if os.path.exists(store):
2987 for oldf in os.listdir(store):
2988 if oldf.endswith('.ldb'):
2989 newf = oldf.replace('.ldb', '.sst')
2990 oldp = os.path.join(store, oldf)
2991 newp = os.path.join(store, newf)
2992 logger.debug('Renaming %s -> %s' % (oldp, newp))
2993 os.rename(oldp, newp)
num_renamed += 1
2994 if num_renamed:
2995 logger.info('Renamed %d leveldb *.ldb files to *.sst',
2996 num_renamed)
2997 if daemon_type == 'osd':
2998 for n in ['block', 'block.db', 'block.wal']:
2999 p = os.path.join(data_dir_dst, n)
3000 if os.path.exists(p):
3001 logger.info('Chowning %s...' % p)
3002 os.chown(p, uid, gid)
3003 # disable the ceph-volume 'simple' mode files on the host
3004 simple_fn = os.path.join('/etc/ceph/osd',
3005 '%s-%s.json' % (daemon_id, osd_fsid))
3006 if os.path.exists(simple_fn):
3007 new_fn = simple_fn + '.adopted-by-cephadm'
3008 logger.info('Renaming %s -> %s', simple_fn, new_fn)
3009 os.rename(simple_fn, new_fn)
3010 logger.info('Disabling host ceph-volume@simple unit...')
3011 call_throws(['systemctl', 'disable',
3012 'ceph-volume@simple-%s-%s.service' % (
3013 daemon_id, osd_fsid)])
3014 else:
3015 # assume this is an 'lvm' c-v for now, but don't error
3016 # out if it's not.
3017 logger.info('Disabling host ceph-volume@lvm unit...')
3018 call(['systemctl', 'disable',
3019 'ceph-volume@lvm-%s-%s.service' % (daemon_id, osd_fsid)])
3020
3021 # config
3022 config_src = '/etc/ceph/%s.conf' % (args.cluster)
3023 config_src = os.path.abspath(args.legacy_dir + config_src)
3024 config_dst = os.path.join(data_dir_dst, 'config')
3025 copy_files([config_src], config_dst, uid=uid, gid=gid)
3026
3027 # logs
3028 logger.info('Moving logs...')
3029 log_dir_src = ('/var/log/ceph/%s-%s.%s.log*' %
3030 (args.cluster, daemon_type, daemon_id))
3031 log_dir_src = os.path.abspath(args.legacy_dir + log_dir_src)
3032 log_dir_dst = make_log_dir(fsid, uid=uid, gid=gid)
3033 move_files(glob(log_dir_src),
3034 log_dir_dst,
3035 uid=uid, gid=gid)
3036
3037 logger.info('Creating new units...')
3038 make_var_run(fsid, uid, gid)
3039 c = get_container(fsid, daemon_type, daemon_id)
3040 deploy_daemon_units(fsid, uid, gid, daemon_type, daemon_id, c,
3041 enable=True, # unconditionally enable the new unit
3042 start=(state == 'running'),
3043 osd_fsid=osd_fsid)
3044 update_firewalld(daemon_type)
3045
3046
3047 def command_adopt_prometheus(daemon_id, fsid):
3048 # type: (str, str) -> None
3049
3050 daemon_type = 'prometheus'
3051 (uid, gid) = extract_uid_gid_monitoring(daemon_type)
3052
3053 _stop_and_disable('prometheus')
3054
3055 data_dir_dst = make_data_dir(fsid, daemon_type, daemon_id,
3056 uid=uid, gid=gid)
3057
3058 # config
3059 config_src = '/etc/prometheus/prometheus.yml'
3060 config_src = os.path.abspath(args.legacy_dir + config_src)
3061 config_dst = os.path.join(data_dir_dst, 'etc/prometheus')
3062 copy_files([config_src], config_dst, uid=uid, gid=gid)
3063
3064 # data
3065 data_src = '/var/lib/prometheus/metrics/'
3066 data_src = os.path.abspath(args.legacy_dir + data_src)
3067 data_dst = os.path.join(data_dir_dst, 'data')
3068 copy_tree([data_src], data_dst, uid=uid, gid=gid)
3069
3070 make_var_run(fsid, uid, gid)
3071 c = get_container(fsid, daemon_type, daemon_id)
3072 deploy_daemon(fsid, daemon_type, daemon_id, c, uid, gid)
3073 update_firewalld(daemon_type)
3074
3075 def command_adopt_grafana(daemon_id, fsid):
3076 # type: (str, str) -> None
3077
3078 daemon_type = 'grafana'
3079 (uid, gid) = extract_uid_gid_monitoring(daemon_type)
3080
3081 _stop_and_disable('grafana-server')
3082
3083 data_dir_dst = make_data_dir(fsid, daemon_type, daemon_id,
3084 uid=uid, gid=gid)
3085
3086 # config
3087 config_src = '/etc/grafana/grafana.ini'
3088 config_src = os.path.abspath(args.legacy_dir + config_src)
3089 config_dst = os.path.join(data_dir_dst, 'etc/grafana')
3090 makedirs(config_dst, uid, gid, 0o755)
3091 copy_files([config_src], config_dst, uid=uid, gid=gid)
3092
3093 prov_src = '/etc/grafana/provisioning/'
3094 prov_src = os.path.abspath(args.legacy_dir + prov_src)
3095 prov_dst = os.path.join(data_dir_dst, 'etc/grafana')
3096 copy_tree([prov_src], prov_dst, uid=uid, gid=gid)
3097
3098 # cert
3099 cert = '/etc/grafana/grafana.crt'
3100 key = '/etc/grafana/grafana.key'
3101 if os.path.exists(cert) and os.path.exists(key):
3102 cert_src = '/etc/grafana/grafana.crt'
3103 cert_src = os.path.abspath(args.legacy_dir + cert_src)
3104 makedirs(os.path.join(data_dir_dst, 'etc/grafana/certs'), uid, gid, 0o755)
3105 cert_dst = os.path.join(data_dir_dst, 'etc/grafana/certs/cert_file')
3106 copy_files([cert_src], cert_dst, uid=uid, gid=gid)
3107
3108 key_src = '/etc/grafana/grafana.key'
3109 key_src = os.path.abspath(args.legacy_dir + key_src)
3110 key_dst = os.path.join(data_dir_dst, 'etc/grafana/certs/cert_key')
3111 copy_files([key_src], key_dst, uid=uid, gid=gid)
3112
3113 _adjust_grafana_ini(os.path.join(config_dst, 'grafana.ini'))
3114 else:
3115 logger.debug("Skipping ssl, missing cert {} or key {}".format(cert, key))
3116
3117
3118 # data - possible custom dashboards/plugins
3119 data_src = '/var/lib/grafana/'
3120 data_src = os.path.abspath(args.legacy_dir + data_src)
3121 data_dst = os.path.join(data_dir_dst, 'data')
3122 copy_tree([data_src], data_dst, uid=uid, gid=gid)
3123
3124 make_var_run(fsid, uid, gid)
3125 c = get_container(fsid, daemon_type, daemon_id)
3126 deploy_daemon(fsid, daemon_type, daemon_id, c, uid, gid)
3127 update_firewalld(daemon_type)
3128
3129 def _adjust_grafana_ini(filename):
3130 # type: (str) -> None
3131
3132 # Update cert_file, cert_key pathnames in server section
3133 # ConfigParser does not preserve comments
3134 try:
3135 with open(filename, "r") as grafana_ini:
3136 lines = grafana_ini.readlines()
3137 with open("{}.new".format(filename), "w") as grafana_ini:
3138 server_section=False
3139 for line in lines:
3140 if line.startswith('['):
3141 server_section=False
3142 if line.startswith('[server]'):
3143 server_section=True
3144 if server_section:
3145 line = re.sub(r'^cert_file.*',
3146 'cert_file = /etc/grafana/certs/cert_file', line)
3147 line = re.sub(r'^cert_key.*',
3148 'cert_key = /etc/grafana/certs/cert_key', line)
3149 grafana_ini.write(line)
3150 os.rename("{}.new".format(filename), filename)
3151 except OSError as err:
3152 raise Error("Cannot update {}: {}".format(filename, err))
3153
3154
3155 def _stop_and_disable(unit_name):
3156 # type: (str) -> None
3157
3158 (enabled, state, _) = check_unit(unit_name)
3159 if state == 'running':
3160 logger.info('Stopping old systemd unit %s...' % unit_name)
3161 call_throws(['systemctl', 'stop', unit_name])
3162 if enabled:
3163 logger.info('Disabling old systemd unit %s...' % unit_name)
3164 call_throws(['systemctl', 'disable', unit_name])
3165
3166
3167 ##################################
3168
3169 def command_rm_daemon():
3170 # type: () -> None
3171
3172 l = FileLock(args.fsid)
3173 l.acquire()
3174
3175 (daemon_type, daemon_id) = args.name.split('.', 1)
3176 if daemon_type in ['mon', 'osd'] and not args.force:
3177 raise Error('must pass --force to proceed: '
3178 'this command may destroy precious data!')
3179 unit_name = get_unit_name(args.fsid, daemon_type, daemon_id)
3180 call(['systemctl', 'stop', unit_name],
3181 verbose_on_failure=False)
3182 call(['systemctl', 'reset-failed', unit_name],
3183 verbose_on_failure=False)
3184 call(['systemctl', 'disable', unit_name],
3185 verbose_on_failure=False)
3186 data_dir = get_data_dir(args.fsid, daemon_type, daemon_id)
3187 if daemon_type in ['mon', 'osd', 'prometheus'] and \
3188 not args.force_delete_data:
3189 # rename it out of the way -- do not delete
3190 backup_dir = os.path.join(args.data_dir, args.fsid, 'removed')
3191 if not os.path.exists(backup_dir):
3192 makedirs(backup_dir, 0, 0, DATA_DIR_MODE)
3193 dirname = '%s.%s_%s' % (daemon_type, daemon_id,
3194 datetime.datetime.utcnow().strftime(DATEFMT))
3195 os.rename(data_dir,
3196 os.path.join(backup_dir, dirname))
3197 else:
3198 call_throws(['rm', '-rf', data_dir])
3199
3200 ##################################
3201
3202 def command_rm_cluster():
3203 # type: () -> None
3204 if not args.force:
3205 raise Error('must pass --force to proceed: '
3206 'this command may destroy precious data!')
3207
3208 l = FileLock(args.fsid)
3209 l.acquire()
3210
3211 # stop + disable individual daemon units
3212 for d in list_daemons(detail=False):
3213 if d['fsid'] != args.fsid:
3214 continue
3215 if d['style'] != 'cephadm:v1':
3216 continue
3217 unit_name = get_unit_name(args.fsid, d['name'])
3218 call(['systemctl', 'stop', unit_name],
3219 verbose_on_failure=False)
3220 call(['systemctl', 'reset-failed', unit_name],
3221 verbose_on_failure=False)
3222 call(['systemctl', 'disable', unit_name],
3223 verbose_on_failure=False)
3224
3225 # cluster units
3226 for unit_name in ['ceph-%s.target' % args.fsid]:
3227 call(['systemctl', 'stop', unit_name],
3228 verbose_on_failure=False)
3229 call(['systemctl', 'reset-failed', unit_name],
3230 verbose_on_failure=False)
3231 call(['systemctl', 'disable', unit_name],
3232 verbose_on_failure=False)
3233
3234 slice_name = 'system-%s.slice' % (('ceph-%s' % args.fsid).replace('-',
3235 '\\x2d'))
3236 call(['systemctl', 'stop', slice_name],
3237 verbose_on_failure=False)
3238
3239 # rm units
3240 call_throws(['rm', '-f', args.unit_dir +
3241 '/ceph-%s@.service' % args.fsid])
3242 call_throws(['rm', '-f', args.unit_dir +
3243 '/ceph-%s.target' % args.fsid])
3244 call_throws(['rm', '-rf',
3245 args.unit_dir + '/ceph-%s.target.wants' % args.fsid])
3246 # rm data
3247 call_throws(['rm', '-rf', args.data_dir + '/' + args.fsid])
3248 # rm logs
3249 call_throws(['rm', '-rf', args.log_dir + '/' + args.fsid])
3250 call_throws(['rm', '-rf', args.log_dir +
3251 '/*.wants/ceph-%s@*' % args.fsid])
3252 # rm logrotate config
3253 call_throws(['rm', '-f', args.logrotate_dir + '/ceph-%s' % args.fsid])
3254
3255
3256 ##################################
3257
3258 def check_time_sync(enabler=None):
3259 # type: (Optional[Packager]) -> bool
3260 units = [
3261 'chrony.service', # 18.04 (at least)
3262 'chronyd.service', # el / opensuse
3263 'systemd-timesyncd.service',
3264 'ntpd.service', # el7 (at least)
3265 'ntp.service', # 18.04 (at least)
3266 ]
3267 if not check_units(units, enabler=enabler):
3268 logger.warning('No time sync service is running; checked for %s' % units)
3269 return False
3270 return True
3271
3272 def command_check_host():
3273 # type: () -> None
3274 # caller already checked for docker/podman
3275 logger.info('podman|docker (%s) is present' % container_path)
3276
3277 commands = ['systemctl', 'lvcreate']
3278
3279 for command in commands:
3280 try:
3281 find_program(command)
3282 logger.info('%s is present' % command)
3283 except ValueError:
3284 raise Error('%s binary does not appear to be installed' % command)
3285
3286 # check for configured+running chronyd or ntp
3287 if not check_time_sync():
3288 raise Error('No time synchronization is active')
3289
3290 if 'expect_hostname' in args and args.expect_hostname:
3291 if get_hostname() != args.expect_hostname:
3292 raise Error('hostname "%s" does not match expected hostname "%s"' % (
3293 get_hostname(), args.expect_hostname))
3294 logger.info('Hostname "%s" matches what is expected.',
3295 args.expect_hostname)
3296
3297 logger.info('Host looks OK')
3298
3299 ##################################
3300
3301 def command_prepare_host():
3302 # type: () -> None
3303 logger.info('Verifying podman|docker is present...')
3304 pkg = None
3305 if not container_path:
3306 if not pkg:
3307 pkg = create_packager()
3308 pkg.install_podman()
3309
3310 logger.info('Verifying lvm2 is present...')
3311 if not find_executable('lvcreate'):
3312 if not pkg:
3313 pkg = create_packager()
3314 pkg.install(['lvm2'])
3315
3316 logger.info('Verifying time synchronization is in place...')
3317 if not check_time_sync():
3318 if not pkg:
3319 pkg = create_packager()
3320 pkg.install(['chrony'])
3321 # check again, and this time try to enable
3322 # the service
3323 check_time_sync(enabler=pkg)
3324
3325 if 'expect_hostname' in args and args.expect_hostname and args.expect_hostname != get_hostname():
3326 logger.warning('Adjusting hostname from %s -> %s...' % (get_hostname(), args.expect_hostname))
3327 call_throws(['hostname', args.expect_hostname])
3328 with open('/etc/hostname', 'w') as f:
3329 f.write(args.expect_hostname + '\n')
3330
3331 logger.info('Repeating the final host check...')
3332 command_check_host()
3333
3334 ##################################
3335
3336 class CustomValidation(argparse.Action):
3337
3338 def _check_name(self, values):
3339 try:
3340 (daemon_type, daemon_id) = values.split('.', 1)
3341 except ValueError:
3342 raise argparse.ArgumentError(self,
3343 "must be of the format <type>.<id>. For example, osd.1 or prometheus.myhost.com")
3344
3345 daemons = get_supported_daemons()
3346 if daemon_type not in daemons:
3347 raise argparse.ArgumentError(self,
3348 "name must declare the type of daemon e.g. "
3349 "{}".format(', '.join(daemons)))
3350
3351 def __call__(self, parser, namespace, values, option_string=None):
3352 if self.dest == "name":
3353 self._check_name(values)
3354 setattr(namespace, self.dest, values)
3355
3356 ##################################
3357
3358 def get_distro():
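# type: () -> Tuple[Optional[str], Optional[str], Optional[str]]
# parse the ID, VERSION_ID and VERSION_CODENAME fields from /etc/os-release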
3359 distro = None
3360 distro_version = None
3361 distro_codename = None
3362 with open('/etc/os-release', 'r') as f:
3363 for line in f.readlines():
3364 line = line.strip()
3365 if '=' not in line or line.startswith('#'):
3366 continue
3367 (var, val) = line.split('=', 1)
3368 if val[0] == '"' and val[-1] == '"':
3369 val = val[1:-1]
3370 if var == 'ID':
3371 distro = val.lower()
3372 elif var == 'VERSION_ID':
3373 distro_version = val.lower()
3374 elif var == 'VERSION_CODENAME':
3375 distro_codename = val.lower()
3376 return distro, distro_version, distro_codename
3377
3378 class Packager(object):
3379 def __init__(self, stable=None, version=None, branch=None, commit=None):
3380 assert \
3381 (stable and not version and not branch and not commit) or \
3382 (not stable and version and not branch and not commit) or \
3383 (not stable and not version and branch) or \
3384 (not stable and not version and not branch and not commit)
3385 self.stable = stable
3386 self.version = version
3387 self.branch = branch
3388 self.commit = commit
3389
3390 def add_repo(self):
3391 raise NotImplementedError
3392
3393 def rm_repo(self):
3394 raise NotImplementedError
3395
3396 def query_shaman(self, distro, distro_version, branch, commit):
3397 # query shaman
3398 logging.info('Fetching repo metadata from shaman and chacra...')
3399 shaman_url = 'https://shaman.ceph.com/api/repos/ceph/{branch}/{sha1}/{distro}/{distro_version}/repo/?arch={arch}'.format(
3400 distro=distro,
3401 distro_version=distro_version,
3402 branch=branch,
3403 sha1=commit or 'latest',
3404 arch=get_arch()
3405 )
3406 try:
3407 shaman_response = urlopen(shaman_url)
3408 except HTTPError as err:
3409 logging.error('repository not found in shaman (might not be available yet)')
3410 raise Error('%s, failed to fetch %s' % (err, shaman_url))
3411 try:
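# shaman redirects to the chacra URL that hosts the actual repo file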
3412 chacra_url = shaman_response.geturl()
3413 chacra_response = urlopen(chacra_url)
3414 except HTTPError as err:
3415 logging.error('repository not found in chacra (might not be available yet)')
3416 raise Error('%s, failed to fetch %s' % (err, chacra_url))
3417 return chacra_response.read().decode('utf-8')
3418
3419 def repo_gpgkey(self):
3420 if args.gpg_url:
3421 return args.gpg_url, 'manual'  # callers expect a (url, name) pair; 'manual' labels a user-supplied key
3422 if self.stable or self.version:
3423 return 'https://download.ceph.com/keys/release.asc', 'release'
3424 else:
3425 return 'https://download.ceph.com/keys/autobuild.asc', 'autobuild'
3426
3427 def enable_service(self, service):
3428 """
3429 Start and enable the service (typically using systemd).
3430 """
3431 call_throws(['systemctl', 'enable', '--now', service])
3432
3433
3434 class Apt(Packager):
3435 DISTRO_NAMES = {
3436 'ubuntu': 'ubuntu',
3437 'debian': 'debian',
3438 }
3439
3440 def __init__(self, stable, version, branch, commit,
3441 distro, distro_version, distro_codename):
3442 super(Apt, self).__init__(stable=stable, version=version,
3443 branch=branch, commit=commit)
3444 self.distro = self.DISTRO_NAMES[distro]
3445 self.distro_codename = distro_codename
3446
3447 def repo_path(self):
3448 return '/etc/apt/sources.list.d/ceph.list'
3449
3450 def add_repo(self):
3451 url, name = self.repo_gpgkey()
3452 logging.info('Installing repo GPG key from %s...' % url)
3453 try:
3454 response = urlopen(url)
3455 except HTTPError as err:
3456 logging.error('failed to fetch GPG repo key from %s: %s' % (
3457 url, err))
3458 raise Error('failed to fetch GPG key')
3459 key = response.read().decode('utf-8')
3460 with open('/etc/apt/trusted.gpg.d/ceph.%s.gpg' % name, 'w') as f:
3461 f.write(key)
3462
3463 if self.version:
3464 content = 'deb %s/debian-%s/ %s main\n' % (
3465 args.repo_url, self.version, self.distro_codename)
3466 elif self.stable:
3467 content = 'deb %s/debian-%s/ %s main\n' % (
3468 args.repo_url, self.stable, self.distro_codename)
3469 else:
3470 content = self.query_shaman(self.distro, self.distro_codename, self.branch,
3471 self.commit)
3472
3473 logging.info('Installing repo file at %s...' % self.repo_path())
3474 with open(self.repo_path(), 'w') as f:
3475 f.write(content)
3476
3477 def rm_repo(self):
3478 for name in ['autobuild', 'release']:
3479 p = '/etc/apt/trusted.gpg.d/ceph.%s.gpg' % name
3480 if os.path.exists(p):
3481 logging.info('Removing repo GPG key %s...' % p)
3482 os.unlink(p)
3483 if os.path.exists(self.repo_path()):
3484 logging.info('Removing repo at %s...' % self.repo_path())
3485 os.unlink(self.repo_path())
3486
3487 def install(self, ls):
3488 logging.info('Installing packages %s...' % ls)
3489 call_throws(['apt', 'install', '-y'] + ls)
3490
3491 def install_podman(self):
3492 if self.distro == 'ubuntu':
3493 logging.info('Setting up repo for podman...')
3494 self.install(['software-properties-common'])
3495 call_throws(['add-apt-repository', '-y', 'ppa:projectatomic/ppa'])
3496 call_throws(['apt', 'update'])
3497
3498 logging.info('Attempting podman install...')
3499 try:
3500 self.install(['podman'])
3501 except Error:
3502 logging.info('Podman install failed. Falling back to docker...')
3503 self.install(['docker.io'])
3504
3505 class YumDnf(Packager):
3506 DISTRO_NAMES = {
3507 'centos': ('centos', 'el'),
3508 'rhel': ('centos', 'el'),
3509 'scientific': ('centos', 'el'),
3510 'fedora': ('fedora', 'fc'),
3511 }
3512
3513 def __init__(self, stable, version, branch, commit,
3514 distro, distro_version):
3515 super(YumDnf, self).__init__(stable=stable, version=version,
3516 branch=branch, commit=commit)
3517 self.major = int(distro_version.split('.')[0])
3518 self.distro_normalized = self.DISTRO_NAMES[distro][0]
3519 self.distro_code = self.DISTRO_NAMES[distro][1] + str(self.major)
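# Fedora 30+ and EL 8+ ship dnf; older releases still use yum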
3520 if (self.distro_code == 'fc' and self.major >= 30) or \
3521 (self.distro_code == 'el' and self.major >= 8):
3522 self.tool = 'dnf'
3523 else:
3524 self.tool = 'yum'
3525
3526 def custom_repo(self, **kw):
3527 """
3528 Repo files need special care: a whole line should be omitted when its
3529 key has no value. With a plain `format()` template we could not
3530 conditionally drop a line, so the end result would contain a key with
3531 a missing value (say, if we were passing `None`).
3532
3533 For example, it could look like::
3534
3535 [ceph repo]
3536 name= ceph repo
3537 proxy=
3538 gpgcheck=
3539
3540 Which breaks the repo file. This function conditionally adds a line
3541 only when its key has a value, preserving the key order throughout.
3542
3543 For historical reference, this is how the template used
3544 to look::
3545
3546 custom_repo =
3547 [{repo_name}]
3548 name={name}
3549 baseurl={baseurl}
3550 enabled={enabled}
3551 gpgcheck={gpgcheck}
3552 type={_type}
3553 gpgkey={gpgkey}
3554 proxy={proxy}
3555
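For example (illustrative values only), a call such as::

custom_repo(reponame='ceph', name='Ceph',
baseurl='http://example.com/ceph',
enabled=1, gpgcheck=1)

renders only the keys that were given a value, in template order::

[ceph]
name=Ceph
baseurl=http://example.com/ceph
enabled=1
gpgcheck=1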
3556 """
3557 lines = []
3558
3559 # by using tuples (vs a dict) we preserve the order of what we want to
3560 # return, like starting with a [repo name]
3561 tmpl = (
3562 ('reponame', '[%s]'),
3563 ('name', 'name=%s'),
3564 ('baseurl', 'baseurl=%s'),
3565 ('enabled', 'enabled=%s'),
3566 ('gpgcheck', 'gpgcheck=%s'),
3567 ('_type', 'type=%s'),
3568 ('gpgkey', 'gpgkey=%s'),
3569 ('proxy', 'proxy=%s'),
3570 ('priority', 'priority=%s'),
3571 )
3572
3573 for line in tmpl:
3574 tmpl_key, tmpl_value = line # key values from tmpl
3575
3576 # ensure that there is an actual value (not None nor empty string)
3577 if tmpl_key in kw and kw.get(tmpl_key) not in (None, ''):
3578 lines.append(tmpl_value % kw.get(tmpl_key))
3579
3580 return '\n'.join(lines)
3581
3582 def repo_path(self):
3583 return '/etc/yum.repos.d/ceph.repo'
3584
3585 def repo_baseurl(self):
3586 assert self.stable or self.version
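# e.g., https://download.ceph.com/rpm-15.2.0/el8 with the default --repo-url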
3587 if self.version:
3588 return '%s/rpm-%s/%s' % (args.repo_url, self.version,
3589 self.distro_code)
3590 else:
3591 return '%s/rpm-%s/%s' % (args.repo_url, self.stable,
3592 self.distro_code)
3593
3594 def add_repo(self):
3595 if self.stable or self.version:
3596 content = ''
3597 for n, t in {
3598 'Ceph': '$basearch',
3599 'Ceph-noarch': 'noarch',
3600 'Ceph-source': 'SRPMS'}.items():
3601 content += '[%s]\n' % (n)
3602 content += self.custom_repo(
3603 name='Ceph %s' % t,
3604 baseurl=self.repo_baseurl() + '/' + t,
3605 enabled=1,
3606 gpgcheck=1,
3607 gpgkey=self.repo_gpgkey()[0],
3608 )
3609 content += '\n\n'
3610 else:
3611 content = self.query_shaman(self.distro_normalized, self.major,
3612 self.branch,
3613 self.commit)
3614
3615 logger.info('Writing repo to %s...' % self.repo_path())
3616 with open(self.repo_path(), 'w') as f:
3617 f.write(content)
3618
3619 if self.distro_code.startswith('el'):
3620 logger.info('Enabling EPEL...')
3621 call_throws([self.tool, 'install', '-y', 'epel-release'])
3622 if self.distro_code == 'el8':
3623 # we also need Ken's copr repo, at least for now
3624 logger.info('Enabling supplementary copr repo ktdreyer/ceph-el8...')
3625 call_throws(['dnf', 'copr', 'enable', '-y', 'ktdreyer/ceph-el8'])
3626
3627 def rm_repo(self):
3628 if os.path.exists(self.repo_path()):
3629 os.unlink(self.repo_path())
3630 if self.distro_code == 'el8':
3631 logger.info('Disabling supplementary copr repo ktdreyer/ceph-el8...')
3632 call_throws(['dnf', 'copr', 'disable', '-y', 'ktdreyer/ceph-el8'])
3633
3634 def install(self, ls):
3635 logger.info('Installing packages %s...' % ls)
3636 call_throws([self.tool, 'install', '-y'] + ls)
3637
3638 def install_podman(self):
3639 self.install(['podman'])
3640
3641
3642 class Zypper(Packager):
3643 DISTRO_NAMES = [
3644 'sles',
3645 'opensuse-tumbleweed',
3646 'opensuse-leap'
3647 ]
3648
3649 def __init__(self, stable, version, branch, commit,
3650 distro, distro_version):
3651 super(Zypper, self).__init__(stable=stable, version=version,
3652 branch=branch, commit=commit)
3653 self.tool = 'zypper'
3654 self.distro = 'opensuse'
3655 self.distro_version = '15.1'
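# Tumbleweed is a rolling release with no version number, so only
# override the Leap 15.1 default when we have a versioned distro.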
3656 if 'tumbleweed' not in distro and distro_version is not None:
3657 self.distro_version = distro_version
3658
3659 def custom_repo(self, **kw):
3660 """
3661 See YumDnf for format explanation.
3662 """
3663 lines = []
3664
3665 # by using tuples (vs a dict) we preserve the order of what we want to
3666 # return, like starting with a [repo name]
3667 tmpl = (
3668 ('reponame', '[%s]'),
3669 ('name', 'name=%s'),
3670 ('baseurl', 'baseurl=%s'),
3671 ('enabled', 'enabled=%s'),
3672 ('gpgcheck', 'gpgcheck=%s'),
3673 ('_type', 'type=%s'),
3674 ('gpgkey', 'gpgkey=%s'),
3675 ('proxy', 'proxy=%s'),
3676 ('priority', 'priority=%s'),
3677 )
3678
3679 for line in tmpl:
3680 tmpl_key, tmpl_value = line # key values from tmpl
3681
3682 # ensure that there is an actual value (not None nor empty string)
3683 if tmpl_key in kw and kw.get(tmpl_key) not in (None, ''):
3684 lines.append(tmpl_value % kw.get(tmpl_key))
3685
3686 return '\n'.join(lines)
3687
3688 def repo_path(self):
3689 return '/etc/zypp/repos.d/ceph.repo'
3690
3691 def repo_baseurl(self):
3692 assert self.stable or self.version
3693 if self.version:
3694 return '%s/rpm-%s/%s' % (args.repo_url, self.version, self.distro)
3695 else:
3696 return '%s/rpm-%s/%s' % (args.repo_url, self.stable, self.distro)
3697
3698 def add_repo(self):
3699 if self.stable or self.version:
3700 content = ''
3701 for n, t in {
3702 'Ceph': '$basearch',
3703 'Ceph-noarch': 'noarch',
3704 'Ceph-source': 'SRPMS'}.items():
3705 content += '[%s]\n' % (n)
3706 content += self.custom_repo(
3707 name='Ceph %s' % t,
3708 baseurl=self.repo_baseurl() + '/' + t,
3709 enabled=1,
3710 gpgcheck=1,
3711 gpgkey=self.repo_gpgkey()[0],
3712 )
3713 content += '\n\n'
3714 else:
3715 content = self.query_shaman(self.distro, self.distro_version,
3716 self.branch,
3717 self.commit)
3718
3719 logger.info('Writing repo to %s...' % self.repo_path())
3720 with open(self.repo_path(), 'w') as f:
3721 f.write(content)
3722
3723 def rm_repo(self):
3724 if os.path.exists(self.repo_path()):
3725 os.unlink(self.repo_path())
3726
3727 def install(self, ls):
3728 logger.info('Installing packages %s...' % ls)
3729 call_throws([self.tool, 'in', '-y'] + ls)
3730
3731 def install_podman(self):
3732 self.install(['podman'])
3733
3734
3735 def create_packager(stable=None, version=None, branch=None, commit=None):
3736 distro, distro_version, distro_codename = get_distro()
3737 if distro in YumDnf.DISTRO_NAMES:
3738 return YumDnf(stable=stable, version=version,
3739 branch=branch, commit=commit,
3740 distro=distro, distro_version=distro_version)
3741 elif distro in Apt.DISTRO_NAMES:
3742 return Apt(stable=stable, version=version,
3743 branch=branch, commit=commit,
3744 distro=distro, distro_version=distro_version,
3745 distro_codename=distro_codename)
3746 elif distro in Zypper.DISTRO_NAMES:
3747 return Zypper(stable=stable, version=version,
3748 branch=branch, commit=commit,
3749 distro=distro, distro_version=distro_version)
3750 raise Error('Distro %s version %s not supported' % (distro, distro_version))
3751
3752
3753 def command_add_repo():
3754 if args.version and args.release:
3755 raise Error('you can specify either --release or --version but not both')
3756 if args.version:
3757 try:
3758 (x, y, z) = args.version.split('.')
3759 except Exception:
3760 raise Error('version must be in the form x.y.z (e.g., 15.2.0)')
3761
3762 pkg = create_packager(stable=args.release,
3763 version=args.version,
3764 branch=args.dev,
3765 commit=args.dev_commit)
3766 pkg.add_repo()
3767
3768 def command_rm_repo():
3769 pkg = create_packager()
3770 pkg.rm_repo()
3771
3772 def command_install():
3773 pkg = create_packager()
3774 pkg.install(args.packages)
3775
3776 ##################################
3777
3778 def _get_parser():
3779 # type: () -> argparse.ArgumentParser
3780 parser = argparse.ArgumentParser(
3781 description='Bootstrap Ceph daemons with systemd and containers.',
3782 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
3783 parser.add_argument(
3784 '--image',
3785 help='container image. Can also be set via the "CEPHADM_IMAGE" '
3786 'env var')
3787 parser.add_argument(
3788 '--docker',
3789 action='store_true',
3790 help='use docker instead of podman')
3791 parser.add_argument(
3792 '--data-dir',
3793 default=DATA_DIR,
3794 help='base directory for daemon data')
3795 parser.add_argument(
3796 '--log-dir',
3797 default=LOG_DIR,
3798 help='base directory for daemon logs')
3799 parser.add_argument(
3800 '--logrotate-dir',
3801 default=LOGROTATE_DIR,
3802 help='location of logrotate configuration files')
3803 parser.add_argument(
3804 '--unit-dir',
3805 default=UNIT_DIR,
3806 help='base directory for systemd units')
3807 parser.add_argument(
3808 '--verbose', '-v',
3809 action='store_true',
3810 help='Show debug-level log messages')
3811 parser.add_argument(
3812 '--timeout',
3813 type=int,
3814 default=DEFAULT_TIMEOUT,
3815 help='timeout in seconds')
3816 parser.add_argument(
3817 '--retry',
3818 type=int,
3819 default=DEFAULT_RETRY,
3820 help='max number of retries')
3821
3822 subparsers = parser.add_subparsers(help='sub-command')
3823
3824 parser_version = subparsers.add_parser(
3825 'version', help='get ceph version from container')
3826 parser_version.set_defaults(func=command_version)
3827
3828 parser_pull = subparsers.add_parser(
3829 'pull', help='pull latest image version')
3830 parser_pull.set_defaults(func=command_pull)
3831
3832 parser_inspect_image = subparsers.add_parser(
3833 'inspect-image', help='inspect local container image')
3834 parser_inspect_image.set_defaults(func=command_inspect_image)
3835
3836 parser_ls = subparsers.add_parser(
3837 'ls', help='list daemon instances on this host')
3838 parser_ls.set_defaults(func=command_ls)
3839 parser_ls.add_argument(
3840 '--no-detail',
3841 action='store_true',
3842 help='Do not include daemon status')
3843 parser_ls.add_argument(
3844 '--legacy-dir',
3845 default='/',
3846 help='base directory for legacy daemon data')
3847
3848 parser_list_networks = subparsers.add_parser(
3849 'list-networks', help='list IP networks')
3850 parser_list_networks.set_defaults(func=command_list_networks)
3851
3852 parser_adopt = subparsers.add_parser(
3853 'adopt', help='adopt daemon deployed with a different tool')
3854 parser_adopt.set_defaults(func=command_adopt)
3855 parser_adopt.add_argument(
3856 '--name', '-n',
3857 required=True,
3858 help='daemon name (type.id)')
3859 parser_adopt.add_argument(
3860 '--style',
3861 required=True,
3862 help='deployment style (legacy, ...)')
3863 parser_adopt.add_argument(
3864 '--cluster',
3865 default='ceph',
3866 help='cluster name')
3867 parser_adopt.add_argument(
3868 '--legacy-dir',
3869 default='/',
3870 help='base directory for legacy daemon data')
3871 parser_adopt.add_argument(
3872 '--config-json',
3873 help='Additional configuration information in JSON format')
3874 parser_adopt.add_argument(
3875 '--skip-firewalld',
3876 action='store_true',
3877 help='Do not configure firewalld')
3878 parser_adopt.add_argument(
3879 '--skip-pull',
3880 action='store_true',
3881 help='do not pull the latest image before adopting')
3882
3883 parser_rm_daemon = subparsers.add_parser(
3884 'rm-daemon', help='remove daemon instance')
3885 parser_rm_daemon.set_defaults(func=command_rm_daemon)
3886 parser_rm_daemon.add_argument(
3887 '--name', '-n',
3888 required=True,
3889 action=CustomValidation,
3890 help='daemon name (type.id)')
3891 parser_rm_daemon.add_argument(
3892 '--fsid',
3893 required=True,
3894 help='cluster FSID')
3895 parser_rm_daemon.add_argument(
3896 '--force',
3897 action='store_true',
3898 help='proceed, even though this may destroy valuable data')
3899 parser_rm_daemon.add_argument(
3900 '--force-delete-data',
3901 action='store_true',
3902 help='delete valuable daemon data instead of making a backup')
3903
3904 parser_rm_cluster = subparsers.add_parser(
3905 'rm-cluster', help='remove all daemons for a cluster')
3906 parser_rm_cluster.set_defaults(func=command_rm_cluster)
3907 parser_rm_cluster.add_argument(
3908 '--fsid',
3909 required=True,
3910 help='cluster FSID')
3911 parser_rm_cluster.add_argument(
3912 '--force',
3913 action='store_true',
3914 help='proceed, even though this may destroy valuable data')
3915
3916 parser_run = subparsers.add_parser(
3917 'run', help='run a ceph daemon, in a container, in the foreground')
3918 parser_run.set_defaults(func=command_run)
3919 parser_run.add_argument(
3920 '--name', '-n',
3921 required=True,
3922 help='daemon name (type.id)')
3923 parser_run.add_argument(
3924 '--fsid',
3925 required=True,
3926 help='cluster FSID')
3927
3928 parser_shell = subparsers.add_parser(
3929 'shell', help='run an interactive shell inside a daemon container')
3930 parser_shell.set_defaults(func=command_shell)
3931 parser_shell.add_argument(
3932 '--fsid',
3933 help='cluster FSID')
3934 parser_shell.add_argument(
3935 '--name', '-n',
3936 help='daemon name (type.id)')
3937 parser_shell.add_argument(
3938 '--config', '-c',
3939 help='ceph.conf to pass through to the container')
3940 parser_shell.add_argument(
3941 '--keyring', '-k',
3942 help='ceph.keyring to pass through to the container')
3943 parser_shell.add_argument(
3944 '--env', '-e',
3945 action='append',
3946 default=[],
3947 help='set environment variable')
3948 parser_shell.add_argument(
3949 'command', nargs='*',
3950 help='command (optional)')
3951
3952 parser_enter = subparsers.add_parser(
3953 'enter', help='run an interactive shell inside a running daemon container')
3954 parser_enter.set_defaults(func=command_enter)
3955 parser_enter.add_argument(
3956 '--fsid',
3957 help='cluster FSID')
3958 parser_enter.add_argument(
3959 '--name', '-n',
3960 required=True,
3961 help='daemon name (type.id)')
3962 parser_enter.add_argument(
3963 'command', nargs='*',
3964 help='command')
3965
3966 parser_ceph_volume = subparsers.add_parser(
3967 'ceph-volume', help='run ceph-volume inside a container')
3968 parser_ceph_volume.set_defaults(func=command_ceph_volume)
3969 parser_ceph_volume.add_argument(
3970 '--fsid',
3971 help='cluster FSID')
3972 parser_ceph_volume.add_argument(
3973 '--config-json',
3974 help='JSON file with config and (client.bootstrap-osd) key')
3975 parser_ceph_volume.add_argument(
3976 'command', nargs='+',
3977 help='command')
3978
3979 parser_unit = subparsers.add_parser(
3980 'unit', help='operate on the daemon\'s systemd unit')
3981 parser_unit.set_defaults(func=command_unit)
3982 parser_unit.add_argument(
3983 'command',
3984 help='systemd command (start, stop, restart, enable, disable, ...)')
3985 parser_unit.add_argument(
3986 '--fsid',
3987 help='cluster FSID')
3988 parser_unit.add_argument(
3989 '--name', '-n',
3990 required=True,
3991 help='daemon name (type.id)')
3992
3993 parser_logs = subparsers.add_parser(
3994 'logs', help='print journald logs for a daemon container')
3995 parser_logs.set_defaults(func=command_logs)
3996 parser_logs.add_argument(
3997 '--fsid',
3998 help='cluster FSID')
3999 parser_logs.add_argument(
4000 '--name', '-n',
4001 required=True,
4002 help='daemon name (type.id)')
4003 parser_logs.add_argument(
4004 'command', nargs='*',
4005 help='additional journalctl args')
4006
4007 parser_bootstrap = subparsers.add_parser(
4008 'bootstrap', help='bootstrap a cluster (mon + mgr daemons)')
4009 parser_bootstrap.set_defaults(func=command_bootstrap)
4010 parser_bootstrap.add_argument(
4011 '--config', '-c',
4012 help='ceph conf file to incorporate')
4013 parser_bootstrap.add_argument(
4014 '--mon-id',
4015 required=False,
4016 help='mon id (default: local hostname)')
4017 parser_bootstrap.add_argument(
4018 '--mon-addrv',
4019 help='mon IPs (e.g., [v2:localipaddr:3300,v1:localipaddr:6789])')
4020 parser_bootstrap.add_argument(
4021 '--mon-ip',
4022 help='mon IP')
4023 parser_bootstrap.add_argument(
4024 '--mgr-id',
4025 required=False,
4026 help='mgr id (default: randomly generated)')
4027 parser_bootstrap.add_argument(
4028 '--fsid',
4029 help='cluster FSID')
4030 parser_bootstrap.add_argument(
4031 '--output-dir',
4032 default='/etc/ceph',
4033 help='directory to write config, keyring, and pub key files')
4034 parser_bootstrap.add_argument(
4035 '--output-keyring',
4036 help='location to write keyring file with new cluster admin and mon keys')
4037 parser_bootstrap.add_argument(
4038 '--output-config',
4039 help='location to write conf file to connect to new cluster')
4040 parser_bootstrap.add_argument(
4041 '--output-pub-ssh-key',
4042 help='location to write the cluster\'s public SSH key')
4043 parser_bootstrap.add_argument(
4044 '--skip-ssh',
4045 action='store_true',
4046 help='skip setup of ssh key on local host')
4047 parser_bootstrap.add_argument(
4048 '--initial-dashboard-user',
4049 default='admin',
4050 help='Initial user for the dashboard')
4051 parser_bootstrap.add_argument(
4052 '--initial-dashboard-password',
4053 help='Initial password for the initial dashboard user')
4054
4055 parser_bootstrap.add_argument(
4056 '--dashboard-key',
4057 help='Dashboard key')
4058 parser_bootstrap.add_argument(
4059 '--dashboard-crt',
4060 help='Dashboard certificate')
4061
4062 parser_bootstrap.add_argument(
4063 '--skip-mon-network',
4064 action='store_true',
4065 help='do not set mon public_network based on the bootstrap mon IP')
4066 parser_bootstrap.add_argument(
4067 '--skip-dashboard',
4068 action='store_true',
4069 help='do not enable the Ceph Dashboard')
4070 parser_bootstrap.add_argument(
4071 '--dashboard-password-noupdate',
4072 action='store_true',
4073 help='stop forced dashboard password change')
4074 parser_bootstrap.add_argument(
4075 '--no-minimize-config',
4076 action='store_true',
4077 help='do not assimilate and minimize the config file')
4078 parser_bootstrap.add_argument(
4079 '--skip-ping-check',
4080 action='store_true',
4081 help='do not verify that mon IP is pingable')
4082 parser_bootstrap.add_argument(
4083 '--skip-pull',
4084 action='store_true',
4085 help='do not pull the latest image before bootstrapping')
4086 parser_bootstrap.add_argument(
4087 '--skip-firewalld',
4088 action='store_true',
4089 help='Do not configure firewalld')
4090 parser_bootstrap.add_argument(
4091 '--allow-overwrite',
4092 action='store_true',
4093 help='allow overwrite of existing --output-* config/keyring/ssh files')
4094 parser_bootstrap.add_argument(
4095 '--allow-fqdn-hostname',
4096 action='store_true',
4097 help='allow hostname that is fully-qualified (contains ".")')
4098 parser_bootstrap.add_argument(
4099 '--skip-prepare-host',
4100 action='store_true',
4101 help='Do not prepare host')
4102 parser_bootstrap.add_argument(
4103 '--orphan-initial-daemons',
4104 action='store_true',
4105 help='Do not create initial mon, mgr, and crash service specs')
4106 parser_bootstrap.add_argument(
4107 '--skip-monitoring-stack',
4108 action='store_true',
4109 help='Do not automatically provision monitoring stack (prometheus, grafana, alertmanager, node-exporter)')
4110
4111 parser_deploy = subparsers.add_parser(
4112 'deploy', help='deploy a daemon')
4113 parser_deploy.set_defaults(func=command_deploy)
4114 parser_deploy.add_argument(
4115 '--name',
4116 required=True,
4117 action=CustomValidation,
4118 help='daemon name (type.id)')
4119 parser_deploy.add_argument(
4120 '--fsid',
4121 required=True,
4122 help='cluster FSID')
4123 parser_deploy.add_argument(
4124 '--config', '-c',
4125 help='config file for new daemon')
4126 parser_deploy.add_argument(
4127 '--config-json',
4128 help='Additional configuration information in JSON format')
4129 parser_deploy.add_argument(
4130 '--keyring',
4131 help='keyring for new daemon')
4132 parser_deploy.add_argument(
4133 '--key',
4134 help='key for new daemon')
4135 parser_deploy.add_argument(
4136 '--osd-fsid',
4137 help='OSD uuid, if creating an OSD container')
4138 parser_deploy.add_argument(
4139 '--skip-firewalld',
4140 action='store_true',
4141 help='Do not configure firewalld')
4142 parser_deploy.add_argument(
4143 '--reconfig',
4144 action='store_true',
4145 help='Reconfigure a previously deployed daemon')
4146 parser_deploy.add_argument(
4147 '--allow-ptrace',
4148 action='store_true',
4149 help='Allow SYS_PTRACE on daemon container')
4150
4151 parser_check_host = subparsers.add_parser(
4152 'check-host', help='check host configuration')
4153 parser_check_host.set_defaults(func=command_check_host)
4154 parser_check_host.add_argument(
4155 '--expect-hostname',
4156 help='Check that hostname matches an expected value')
4157
4158 parser_prepare_host = subparsers.add_parser(
4159 'prepare-host', help='prepare a host for cephadm use')
4160 parser_prepare_host.set_defaults(func=command_prepare_host)
4161 parser_prepare_host.add_argument(
4162 '--expect-hostname',
4163 help='Set hostname')
4164
4165 parser_add_repo = subparsers.add_parser(
4166 'add-repo', help='configure package repository')
4167 parser_add_repo.set_defaults(func=command_add_repo)
4168 parser_add_repo.add_argument(
4169 '--release',
4170 help='use latest version of a named release (e.g., octopus)')
4171 parser_add_repo.add_argument(
4172 '--version',
4173 help='use specific upstream version (x.y.z)')
4174 parser_add_repo.add_argument(
4175 '--dev',
4176 help='use specified bleeding edge build from git branch or tag')
4177 parser_add_repo.add_argument(
4178 '--dev-commit',
4179 help='use specified bleeding edge build from git commit')
4180 parser_add_repo.add_argument(
4181 '--gpg-url',
4182 help='specify alternative GPG key location')
4183 parser_add_repo.add_argument(
4184 '--repo-url',
4185 default='https://download.ceph.com',
4186 help='specify alternative repo location')
4187 # TODO: proxy?
4188
4189 parser_rm_repo = subparsers.add_parser(
4190 'rm-repo', help='remove package repository configuration')
4191 parser_rm_repo.set_defaults(func=command_rm_repo)
4192
4193 parser_install = subparsers.add_parser(
4194 'install', help='install ceph package(s)')
4195 parser_install.set_defaults(func=command_install)
4196 parser_install.add_argument(
4197 'packages', nargs='*',
4198 default=['cephadm'],
4199 help='packages')
4200
4201 return parser
4202
4203 def _parse_args(av):
4204 parser = _get_parser()
4205 return parser.parse_args(av)
4206
4207 if __name__ == "__main__":
4208 # allow argv to be injected
4209 try:
4210 av = injected_argv # type: ignore
4211 except NameError:
4212 av = sys.argv[1:]
4213 args = _parse_args(av)
4214
4215 if args.verbose:
4216 logging.basicConfig(level=logging.DEBUG)
4217 else:
4218 logging.basicConfig(level=logging.INFO)
4219 logger = logging.getLogger('cephadm')
4220
4221 # root?
4222 if os.geteuid() != 0:
4223 sys.stderr.write('ERROR: cephadm should be run as root\n')
4224 sys.exit(1)
4225
4226 # podman or docker?
4227 if args.docker:
4228 container_path = find_program('docker')
4229 else:
4230 for i in CONTAINER_PREFERENCE:
4231 try:
4232 container_path = find_program(i)
4233 break
4234 except Exception as e:
4235 logger.debug('Could not locate %s: %s' % (i, e))
4236 if not container_path and 'func' in args and args.func != command_prepare_host:
4237 sys.stderr.write('Unable to locate any of %s\n' % CONTAINER_PREFERENCE)
4238 sys.exit(1)
4239
4240 if 'func' not in args:
4241 sys.stderr.write('No command specified; pass -h or --help for usage\n')
4242 sys.exit(1)
4243
4244 try:
4245 r = args.func()
4246 except Error as e:
4247 if args.verbose:
4248 raise
4249 sys.stderr.write('ERROR: %s\n' % e)
4250 sys.exit(1)
4251 if not r:
4252 r = 0
4253 sys.exit(r)