[ceph.git] / ceph / src / pybind / mgr / cephadm / migrations.py

import json
import re
import logging
from typing import TYPE_CHECKING, Iterator, Optional, Dict, Any, List

from ceph.deployment.service_spec import PlacementSpec, ServiceSpec, HostPlacementSpec, RGWSpec
from cephadm.schedule import HostAssignment
import rados

from mgr_module import NFS_POOL_NAME
from orchestrator import OrchestratorError, DaemonDescription

if TYPE_CHECKING:
    from .module import CephadmOrchestrator

LAST_MIGRATION = 6

logger = logging.getLogger(__name__)


class Migrations:
    def __init__(self, mgr: "CephadmOrchestrator"):
        self.mgr = mgr

        # Why having a global counter, instead of spec versions?
        #
        # for the first migration:
        # The specs don't change in (this) migration. but the scheduler here.
        # Adding the version to the specs at this time just felt wrong to me.
        #
        # And the specs are only another part of cephadm which needs potential upgrades.
        # We have the cache, the inventory, the config store, the upgrade (imagine changing the
        # upgrade code, while an old upgrade is still in progress), naming of daemons,
        # fs-layout of the daemons, etc.
        self.set_sane_migration_current()

        v = mgr.get_store('nfs_migration_queue')
        self.nfs_migration_queue = json.loads(v) if v else []

        r = mgr.get_store('rgw_migration_queue')
        self.rgw_migration_queue = json.loads(r) if r else []

        # for some migrations, we don't need to do anything except for
        # incrementing migration_current.
        # let's try to shortcut things here.
        self.migrate(True)

    def set(self, val: int) -> None:
        self.mgr.set_module_option('migration_current', val)
        self.mgr.migration_current = val

    def set_sane_migration_current(self) -> None:
        """
        Repair ``migration_current`` when it holds a value no migration can start from.

        * ``None`` is replaced by ``LAST_MIGRATION``.
        * a value above ``LAST_MIGRATION`` is reset to 0.

        Any other value is left untouched.
        """
        # migration current should always be an integer
        # between 0 and LAST_MIGRATION (inclusive) in order to
        # actually carry out migration. If we find
        # it is None or too high of a value here we should
        # set it to some sane value
        # NOTE(review): only the upper bound is checked; a negative value is not repaired here.
        mc: Optional[int] = self.mgr.migration_current
        if mc is None:
            logger.info('Found migration_current of "None". Setting to last migration.')
            self.set(LAST_MIGRATION)
        elif mc > LAST_MIGRATION:
            logger.error(f'Found migration_current of {mc} when max should be {LAST_MIGRATION}. Setting back to 0.')
            # something has gone wrong and caused migration_current
            # to be higher than it should be able to be. Best option
            # we have here is to just set it back to 0
            self.set(0)

    def is_migration_ongoing(self) -> bool:
        """
        Return True while ``migration_current`` is unset or below ``LAST_MIGRATION``.

        The stored value is sanitised first via ``set_sane_migration_current()``.
        """
        self.set_sane_migration_current()
        mc: Optional[int] = self.mgr.migration_current
        if mc is None:
            return True
        return mc < LAST_MIGRATION

    def verify_no_migration(self) -> None:
        if self.is_migration_ongoing():
            # this is raised in module.serve()
            raise OrchestratorError(
                "cephadm migration still ongoing. Please wait, until the migration is complete.")

    def migrate(self, startup: bool = False) -> None:
        if self.mgr.migration_current == 0:
            if self.migrate_0_1():
                self.set(1)

        if self.mgr.migration_current == 1:
            if self.migrate_1_2():
                self.set(2)

        if self.mgr.migration_current == 2 and not startup:
            if self.migrate_2_3():
                self.set(3)

        if self.mgr.migration_current == 3:
            if self.migrate_3_4():
                self.set(4)

        if self.mgr.migration_current == 4:
            if self.migrate_4_5():
                self.set(5)

        if self.mgr.migration_current == 5:
            if self.migrate_5_6():
                self.set(6)

    def migrate_0_1(self) -> bool:
        """
        Migration 0 -> 1
        New scheduler that takes PlacementSpec as the bound and not as recommendation.
        I.e. the new scheduler won't suggest any new placements outside of the hosts
        specified by label etc.

        Which means, we have to make sure, we're not removing any daemons directly after
        upgrading to the new scheduler.

        There is a potential race here:
        1. user updates his spec to remove daemons
        2. mgr gets upgraded to new scheduler, before the old scheduler removed the daemon
        3. now, we're converting the spec to explicit placement, thus reverting (1.)
        I think this is ok.
        """

        def interesting_specs() -> Iterator[ServiceSpec]:
            for s in self.mgr.spec_store.all_specs.values():
                if s.unmanaged:
                    continue
                p = s.placement
                if p is None:
                    continue
                if p.count is None:
                    continue
                if not p.hosts and not p.host_pattern and not p.label:
                    continue
                yield s

        def convert_to_explicit(spec: ServiceSpec) -> None:
            existing_daemons = self.mgr.cache.get_daemons_by_service(spec.service_name())
            placements, to_add, to_remove = HostAssignment(
                spec=spec,
                hosts=self.mgr.inventory.all_specs(),
                unreachable_hosts=self.mgr.cache.get_unreachable_hosts(),
                draining_hosts=self.mgr.cache.get_draining_hosts(),
                daemons=existing_daemons,
            ).place()

            # We have to migrate, only if the new scheduler would remove daemons
            if len(placements) >= len(existing_daemons):
                return

            def to_hostname(d: DaemonDescription) -> HostPlacementSpec:
                if d.hostname in old_hosts:
                    return old_hosts[d.hostname]
                else:
                    assert d.hostname
                    return HostPlacementSpec(d.hostname, '', '')

            old_hosts = {h.hostname: h for h in spec.placement.hosts}
            new_hosts = [to_hostname(d) for d in existing_daemons]

            new_placement = PlacementSpec(
                hosts=new_hosts,
                count=spec.placement.count
            )

            new_spec = ServiceSpec.from_json(spec.to_json())
            new_spec.placement = new_placement

            logger.info(f"Migrating {spec.one_line_str()} to explicit placement")

            self.mgr.spec_store.save(new_spec)

        specs = list(interesting_specs())
        if not specs:
            return True  # nothing to do. shortcut

        if not self.mgr.cache.daemon_cache_filled():
            logger.info("Unable to migrate yet. Daemon Cache still incomplete.")
            return False

        for spec in specs:
            convert_to_explicit(spec)

        return True

    def migrate_1_2(self) -> bool:
        """
        After 15.2.4, we unified some service IDs: MONs, MGRs etc no longer have a service id.
        Which means, the service names changed:

        mon.foo -> mon
        mgr.foo -> mgr

        This fixes the data structure consistency
        """
        bad_specs = {}
        for name, spec in self.mgr.spec_store.all_specs.items():
            if name != spec.service_name():
                bad_specs[name] = (spec.service_name(), spec)

        for old, (new, old_spec) in bad_specs.items():
            if new not in self.mgr.spec_store.all_specs:
                spec = old_spec
            else:
                spec = self.mgr.spec_store.all_specs[new]
            spec.unmanaged = True
            self.mgr.spec_store.save(spec)
            self.mgr.spec_store.finally_rm(old)

        return True

    def migrate_2_3(self) -> bool:
        if self.nfs_migration_queue:
            from nfs.cluster import create_ganesha_pool

            create_ganesha_pool(self.mgr)
            for service_id, pool, ns in self.nfs_migration_queue:
                if pool != '.nfs':
                    self.migrate_nfs_spec(service_id, pool, ns)
            self.nfs_migration_queue = []
            self.mgr.log.info('Done migrating all NFS services')
        return True

    def migrate_nfs_spec(self, service_id: str, pool: str, ns: Optional[str]) -> None:
        renamed = False
        if service_id.startswith('ganesha-'):
            service_id = service_id[8:]
            renamed = True

        self.mgr.log.info(
            f'Migrating nfs.{service_id} from legacy pool {pool} namespace {ns}'
        )

        # read exports
        ioctx = self.mgr.rados.open_ioctx(pool)
        if ns is not None:
            ioctx.set_namespace(ns)
        object_iterator = ioctx.list_objects()
        exports = []
        while True:
            try:
                obj = object_iterator.__next__()
                if obj.key.startswith('export-'):
                    self.mgr.log.debug(f'reading {obj.key}')
                    exports.append(obj.read().decode())
            except StopIteration:
                break
        self.mgr.log.info(f'Found {len(exports)} exports for legacy nfs.{service_id}')

        # copy grace file
        if service_id != ns:
            try:
                grace = ioctx.read("grace")
                new_ioctx = self.mgr.rados.open_ioctx(NFS_POOL_NAME)
                new_ioctx.set_namespace(service_id)
                new_ioctx.write_full("grace", grace)
                self.mgr.log.info('Migrated nfs-ganesha grace file')
            except rados.ObjectNotFound:
                self.mgr.log.debug('failed to read old grace file; skipping')

        if renamed and f'nfs.ganesha-{service_id}' in self.mgr.spec_store:
            # rename from nfs.ganesha-* to nfs.*.  This will destroy old daemons and
            # deploy new ones.
            self.mgr.log.info(f'Replacing nfs.ganesha-{service_id} with nfs.{service_id}')
            spec = self.mgr.spec_store[f'nfs.ganesha-{service_id}'].spec
            self.mgr.spec_store.rm(f'nfs.ganesha-{service_id}')
            spec.service_id = service_id
            self.mgr.spec_store.save(spec, True)

            # We have to remove the old daemons here as well, otherwise we'll end up with a port conflict.
            daemons = [d.name()
                       for d in self.mgr.cache.get_daemons_by_service(f'nfs.ganesha-{service_id}')]
            self.mgr.log.info(f'Removing old nfs.ganesha-{service_id} daemons {daemons}')
            self.mgr.remove_daemons(daemons)
        else:
            # redeploy all ganesha daemons to ensures that the daemon
            # cephx are correct AND container configs are set up properly
            daemons = [d.name() for d in self.mgr.cache.get_daemons_by_service(f'nfs.{service_id}')]
            self.mgr.log.info(f'Removing old nfs.{service_id} daemons {daemons}')
            self.mgr.remove_daemons(daemons)

            # re-save service spec (without pool and namespace properties!)
            spec = self.mgr.spec_store[f'nfs.{service_id}'].spec
            self.mgr.spec_store.save(spec)

        # import exports
        for export in exports:
            ex = ''
            for line in export.splitlines():
                if (
                        line.startswith('        secret_access_key =')
                        or line.startswith('        user_id =')
                ):
                    continue
                ex += line + '\n'
            self.mgr.log.debug(f'importing export: {ex}')
            ret, out, err = self.mgr.mon_command({
                'prefix': 'nfs export apply',
                'cluster_id': service_id
            }, inbuf=ex)
            if ret:
                self.mgr.log.warning(f'Failed to migrate export ({ret}): {err}\nExport was:\n{ex}')
        self.mgr.log.info(f'Done migrating nfs.{service_id}')

    def migrate_3_4(self) -> bool:
        # We can't set any host with the _admin label, but we're
        # going to warn when calling `ceph orch host rm...`
        if 'client.admin' not in self.mgr.keys.keys:
            self.mgr._client_keyring_set(
                entity='client.admin',
                placement='label:_admin',
            )
        return True

    def migrate_4_5(self) -> bool:
        registry_url = self.mgr.get_module_option('registry_url')
        registry_username = self.mgr.get_module_option('registry_username')
        registry_password = self.mgr.get_module_option('registry_password')
        if registry_url and registry_username and registry_password:

            registry_credentials = {'url': registry_url,
                                    'username': registry_username, 'password': registry_password}
            self.mgr.set_store('registry_credentials', json.dumps(registry_credentials))

            self.mgr.set_module_option('registry_url', None)
            self.mgr.check_mon_command({
                'prefix': 'config rm',
                'who': 'mgr',
                'key': 'mgr/cephadm/registry_url',
            })
            self.mgr.set_module_option('registry_username', None)
            self.mgr.check_mon_command({
                'prefix': 'config rm',
                'who': 'mgr',
                'key': 'mgr/cephadm/registry_username',
            })
            self.mgr.set_module_option('registry_password', None)
            self.mgr.check_mon_command({
                'prefix': 'config rm',
                'who': 'mgr',
                'key': 'mgr/cephadm/registry_password',
            })

            self.mgr.log.info('Done migrating registry login info')
        return True

    def migrate_rgw_spec(self, spec: Dict[Any, Any]) -> Optional[RGWSpec]:
        """ Migrate an old rgw spec to the new format."""
        new_spec = spec.copy()
        field_content: List[str] = re.split(' +', new_spec['spec']['rgw_frontend_type'])
        valid_spec = False
        if 'beast' in field_content:
            new_spec['spec']['rgw_frontend_type'] = 'beast'
            field_content.remove('beast')
            valid_spec = True
        elif 'civetweb' in field_content:
            new_spec['spec']['rgw_frontend_type'] = 'civetweb'
            field_content.remove('civetweb')
            valid_spec = True
        else:
            # Error: Should not happen as that would be an invalid RGW spec. In that case
            # we keep the spec as it, mark it as unmanaged to avoid the daemons being deleted
            # and raise a health warning so the user can fix the issue manually later.
            self.mgr.log.error("Cannot migrate RGW spec, bad rgw_frontend_type value: {spec['spec']['rgw_frontend_type']}.")

        if valid_spec:
            new_spec['spec']['rgw_frontend_extra_args'] = []
            new_spec['spec']['rgw_frontend_extra_args'].extend(field_content)

        return RGWSpec.from_json(new_spec)

    def rgw_spec_needs_migration(self, spec: Dict[Any, Any]) -> bool:
        if 'spec' not in spec:
            # if users allowed cephadm to set up most of the
            # attributes, it's possible there is no "spec" section
            # inside the spec. In that case, no migration is needed
            return False
        return 'rgw_frontend_type' in spec['spec'] \
            and spec['spec']['rgw_frontend_type'] is not None \
            and spec['spec']['rgw_frontend_type'].strip() not in ['beast', 'civetweb']

    def migrate_5_6(self) -> bool:
        """
        Migration 5 -> 6

        Old RGW spec used to allow 'bad' values on the rgw_frontend_type field. For example
        the following value used to be valid:

          rgw_frontend_type: "beast endpoint=10.16.96.54:8043 tcp_nodelay=1"

        As of 17.2.6 release, these kind of entries are not valid anymore and a more strict check
        has been added to validate this field.

        This migration logic detects this 'bad' values and tries to transform them to the new
        valid format where rgw_frontend_type field can only be either 'beast' or 'civetweb'.
        Any extra arguments detected on rgw_frontend_type field will be parsed and passed in the
        new spec field rgw_frontend_extra_args.
        """
        self.mgr.log.debug(f'Starting rgw migration (queue length is {len(self.rgw_migration_queue)})')
        for s in self.rgw_migration_queue:
            spec = s['spec']
            if self.rgw_spec_needs_migration(spec):
                rgw_spec = self.migrate_rgw_spec(spec)
                if rgw_spec is not None:
                    logger.info(f"Migrating {spec} to new RGW with extra args format {rgw_spec}")
                    self.mgr.spec_store.save(rgw_spec)
            else:
                logger.info(f"No Migration is needed for rgw spec: {spec}")
        self.rgw_migration_queue = []
        return True


def queue_migrate_rgw_spec(mgr: "CephadmOrchestrator", spec_dict: Dict[Any, Any]) -> None:
    """
    As part of 17.2.6 a stricter RGW spec validation has been added, so the field
    rgw_frontend_type cannot be used to pass rgw-frontends parameters anymore.

    Append ``spec_dict`` to the JSON list stored under ``rgw_migration_queue``
    so that it can be migrated later.
    """
    service_id = spec_dict['spec']['service_id']

    # a missing/empty store entry is treated as an empty queue
    stored = mgr.get_store('rgw_migration_queue')
    queue = json.loads(stored if stored else '[]')
    queue.append(spec_dict)
    mgr.set_store('rgw_migration_queue', json.dumps(queue))

    mgr.log.info(f'Queued rgw.{service_id} for migration')


def queue_migrate_nfs_spec(mgr: "CephadmOrchestrator", spec_dict: Dict[Any, Any]) -> None:
    """
    After 16.2.5 we dropped the NFSServiceSpec pool and namespace properties.
    Queue up a migration to process later, once we are sure that RADOS is available
    and so on.

    ``pool`` and ``namespace`` are removed from the spec's inner 'spec' section
    (if there is one) and a ``[service_id, pool, namespace]`` entry is appended
    to the JSON list stored under ``nfs_migration_queue``.
    """
    outer = spec_dict['spec']
    service_id = outer['service_id']

    # pop() strips the legacy properties from the caller's dict on purpose;
    # the defaults apply when a property was never set
    legacy_args = outer.get('spec', {})
    pool = legacy_args.pop('pool', 'nfs-ganesha')
    ns = legacy_args.pop('namespace', service_id)

    stored = mgr.get_store('nfs_migration_queue')
    queue = json.loads(stored if stored else '[]')
    queue.append([service_id, pool, ns])
    mgr.set_store('nfs_migration_queue', json.dumps(queue))

    mgr.log.info(f'Queued nfs.{service_id} for migration')
Commit	Line	Data
a4b75251	1	import json
1e59de90	2	import re
f6b5b4d7	3	import logging
1e59de90	4	from typing import TYPE_CHECKING, Iterator, Optional, Dict, Any, List
f6b5b4d7	5
1e59de90	6	from ceph.deployment.service_spec import PlacementSpec, ServiceSpec, HostPlacementSpec, RGWSpec
f6b5b4d7	7	from cephadm.schedule import HostAssignment
a4b75251	8	import rados
f6b5b4d7	9
a4b75251	10	from mgr_module import NFS_POOL_NAME
f67539c2	11	from orchestrator import OrchestratorError, DaemonDescription
f6b5b4d7 TL	12
	13	if TYPE_CHECKING:
	14	from .module import CephadmOrchestrator
	15
1e59de90	16	LAST_MIGRATION = 6
f6b5b4d7 TL	17
	18	logger = logging.getLogger(__name__)
	19
	20
	21	class Migrations:
	22	def __init__(self, mgr: "CephadmOrchestrator"):
	23	self.mgr = mgr
	24
	25	# Why having a global counter, instead of spec versions?
	26	#
	27	# for the first migration:
	28	# The specs don't change in (this) migration. but the scheduler here.
	29	# Adding the version to the specs at this time just felt wrong to me.
	30	#
	31	# And the specs are only another part of cephadm which needs potential upgrades.
	32	# We have the cache, the inventory, the config store, the upgrade (imagine changing the
	33	# upgrade code, while an old upgrade is still in progress), naming of daemons,
	34	# fs-layout of the daemons, etc.
39ae355f	35	self.set_sane_migration_current()
a4b75251 TL	36
	37	v = mgr.get_store('nfs_migration_queue')
	38	self.nfs_migration_queue = json.loads(v) if v else []
f6b5b4d7	39
1e59de90 TL	40	r = mgr.get_store('rgw_migration_queue')
	41	self.rgw_migration_queue = json.loads(r) if r else []
	42
f6b5b4d7	43	# for some migrations, we don't need to do anything except for
20effc67	44	# incrementing migration_current.
f6b5b4d7	45	# let's try to shortcut things here.
a4b75251	46	self.migrate(True)
f6b5b4d7	47
adb31ebb	48	def set(self, val: int) -> None:
f6b5b4d7 TL	49	self.mgr.set_module_option('migration_current', val)
	50	self.mgr.migration_current = val
	51
39ae355f TL	52	def set_sane_migration_current(self) -> None:
	53	# migration current should always be an integer
	54	# between 0 and LAST_MIGRATION (inclusive) in order to
	55	# actually carry out migration. If we find
	56	# it is None or too high of a value here we should
	57	# set it to some sane value
	58	mc: Optional[int] = self.mgr.migration_current
	59	if mc is None:
	60	logger.info('Found migration_current of "None". Setting to last migration.')
	61	self.set(LAST_MIGRATION)
	62	return
	63
	64	if mc > LAST_MIGRATION:
	65	logger.error(f'Found migration_current of {mc} when max should be {LAST_MIGRATION}. Setting back to 0.')
	66	# something has gone wrong and caused migration_current
	67	# to be higher than it should be able to be. Best option
	68	# we have here is to just set it back to 0
	69	self.set(0)
	70
adb31ebb	71	def is_migration_ongoing(self) -> bool:
39ae355f TL	72	self.set_sane_migration_current()
	73	mc: Optional[int] = self.mgr.migration_current
	74	return mc is None or mc < LAST_MIGRATION
f6b5b4d7	75
adb31ebb	76	def verify_no_migration(self) -> None:
f6b5b4d7 TL	77	if self.is_migration_ongoing():
	78	# this is raised in module.serve()
	79	raise OrchestratorError(
	80	"cephadm migration still ongoing. Please wait, until the migration is complete.")
	81
a4b75251	82	def migrate(self, startup: bool = False) -> None:
f6b5b4d7 TL	83	if self.mgr.migration_current == 0:
	84	if self.migrate_0_1():
	85	self.set(1)
	86
	87	if self.mgr.migration_current == 1:
	88	if self.migrate_1_2():
	89	self.set(2)
	90
a4b75251 TL	91	if self.mgr.migration_current == 2 and not startup:
	92	if self.migrate_2_3():
	93	self.set(3)
	94
20effc67 TL	95	if self.mgr.migration_current == 3:
	96	if self.migrate_3_4():
	97	self.set(4)
	98
	99	if self.mgr.migration_current == 4:
	100	if self.migrate_4_5():
	101	self.set(5)
	102
1e59de90 TL	103	if self.mgr.migration_current == 5:
	104	if self.migrate_5_6():
	105	self.set(6)
	106
f6b5b4d7 TL	107	def migrate_0_1(self) -> bool:
	108	"""
	109	Migration 0 -> 1
	110	New scheduler that takes PlacementSpec as the bound and not as recommendation.
	111	I.e. the new scheduler won't suggest any new placements outside of the hosts
	112	specified by label etc.
	113
	114	Which means, we have to make sure, we're not removing any daemons directly after
	115	upgrading to the new scheduler.
	116
	117	There is a potential race here:
	118	1. user updates his spec to remove daemons
	119	2. mgr gets upgraded to new scheduler, before the old scheduler removed the daemon
	120	3. now, we're converting the spec to explicit placement, thus reverting (1.)
	121	I think this is ok.
	122	"""
	123
	124	def interesting_specs() -> Iterator[ServiceSpec]:
f67539c2	125	for s in self.mgr.spec_store.all_specs.values():
f6b5b4d7 TL	126	if s.unmanaged:
	127	continue
	128	p = s.placement
	129	if p is None:
	130	continue
	131	if p.count is None:
	132	continue
	133	if not p.hosts and not p.host_pattern and not p.label:
	134	continue
	135	yield s
	136
	137	def convert_to_explicit(spec: ServiceSpec) -> None:
f67539c2 TL	138	existing_daemons = self.mgr.cache.get_daemons_by_service(spec.service_name())
f67539c2 TL	139	placements, to_add, to_remove = HostAssignment(
f6b5b4d7	140	spec=spec,
f91f0fd5	141	hosts=self.mgr.inventory.all_specs(),
20effc67	142	unreachable_hosts=self.mgr.cache.get_unreachable_hosts(),
2a845540	143	draining_hosts=self.mgr.cache.get_draining_hosts(),
f67539c2	144	daemons=existing_daemons,
f6b5b4d7 TL	145	).place()
f6b5b4d7 TL	146
f6b5b4d7 TL	147	# We have to migrate, only if the new scheduler would remove daemons
	148	if len(placements) >= len(existing_daemons):
	149	return
	150
f67539c2 TL	151	def to_hostname(d: DaemonDescription) -> HostPlacementSpec:
	152	if d.hostname in old_hosts:
	153	return old_hosts[d.hostname]
	154	else:
	155	assert d.hostname
	156	return HostPlacementSpec(d.hostname, '', '')
	157
f6b5b4d7	158	old_hosts = {h.hostname: h for h in spec.placement.hosts}
f67539c2	159	new_hosts = [to_hostname(d) for d in existing_daemons]
f6b5b4d7 TL	160
	161	new_placement = PlacementSpec(
	162	hosts=new_hosts,
	163	count=spec.placement.count
	164	)
	165
	166	new_spec = ServiceSpec.from_json(spec.to_json())
	167	new_spec.placement = new_placement
	168
	169	logger.info(f"Migrating {spec.one_line_str()} to explicit placement")
	170
	171	self.mgr.spec_store.save(new_spec)
	172
	173	specs = list(interesting_specs())
	174	if not specs:
	175	return True # nothing to do. shortcut
	176
	177	if not self.mgr.cache.daemon_cache_filled():
	178	logger.info("Unable to migrate yet. Daemon Cache still incomplete.")
	179	return False
	180
	181	for spec in specs:
	182	convert_to_explicit(spec)
	183
	184	return True
	185
	186	def migrate_1_2(self) -> bool:
	187	"""
	188	After 15.2.4, we unified some service IDs: MONs, MGRs etc no longer have a service id.
	189	Which means, the service names changed:
	190
	191	mon.foo -> mon
	192	mgr.foo -> mgr
	193
	194	This fixes the data structure consistency
	195	"""
	196	bad_specs = {}
f67539c2	197	for name, spec in self.mgr.spec_store.all_specs.items():
f6b5b4d7 TL	198	if name != spec.service_name():
	199	bad_specs[name] = (spec.service_name(), spec)
	200
	201	for old, (new, old_spec) in bad_specs.items():
f67539c2	202	if new not in self.mgr.spec_store.all_specs:
f6b5b4d7 TL	203	spec = old_spec
f6b5b4d7 TL	204	else:
f67539c2	205	spec = self.mgr.spec_store.all_specs[new]
f6b5b4d7 TL	206	spec.unmanaged = True
f6b5b4d7 TL	207	self.mgr.spec_store.save(spec)
f67539c2	208	self.mgr.spec_store.finally_rm(old)
f6b5b4d7 TL	209
f6b5b4d7 TL	210	return True
a4b75251 TL	211
	212	def migrate_2_3(self) -> bool:
	213	if self.nfs_migration_queue:
	214	from nfs.cluster import create_ganesha_pool
	215
	216	create_ganesha_pool(self.mgr)
	217	for service_id, pool, ns in self.nfs_migration_queue:
	218	if pool != '.nfs':
	219	self.migrate_nfs_spec(service_id, pool, ns)
	220	self.nfs_migration_queue = []
	221	self.mgr.log.info('Done migrating all NFS services')
	222	return True
	223
	224	def migrate_nfs_spec(self, service_id: str, pool: str, ns: Optional[str]) -> None:
	225	renamed = False
	226	if service_id.startswith('ganesha-'):
	227	service_id = service_id[8:]
	228	renamed = True
	229
	230	self.mgr.log.info(
	231	f'Migrating nfs.{service_id} from legacy pool {pool} namespace {ns}'
	232	)
	233
	234	# read exports
	235	ioctx = self.mgr.rados.open_ioctx(pool)
	236	if ns is not None:
	237	ioctx.set_namespace(ns)
	238	object_iterator = ioctx.list_objects()
	239	exports = []
	240	while True:
	241	try:
	242	obj = object_iterator.__next__()
	243	if obj.key.startswith('export-'):
	244	self.mgr.log.debug(f'reading {obj.key}')
	245	exports.append(obj.read().decode())
	246	except StopIteration:
	247	break
	248	self.mgr.log.info(f'Found {len(exports)} exports for legacy nfs.{service_id}')
	249
	250	# copy grace file
	251	if service_id != ns:
	252	try:
	253	grace = ioctx.read("grace")
	254	new_ioctx = self.mgr.rados.open_ioctx(NFS_POOL_NAME)
	255	new_ioctx.set_namespace(service_id)
	256	new_ioctx.write_full("grace", grace)
	257	self.mgr.log.info('Migrated nfs-ganesha grace file')
	258	except rados.ObjectNotFound:
	259	self.mgr.log.debug('failed to read old grace file; skipping')
	260
	261	if renamed and f'nfs.ganesha-{service_id}' in self.mgr.spec_store:
	262	# rename from nfs.ganesha-* to nfs.*. This will destroy old daemons and
	263	# deploy new ones.
	264	self.mgr.log.info(f'Replacing nfs.ganesha-{service_id} with nfs.{service_id}')
	265	spec = self.mgr.spec_store[f'nfs.ganesha-{service_id}'].spec
	266	self.mgr.spec_store.rm(f'nfs.ganesha-{service_id}')
	267	spec.service_id = service_id
	268	self.mgr.spec_store.save(spec, True)
20effc67 TL	269
	270	# We have to remove the old daemons here as well, otherwise we'll end up with a port conflict.
	271	daemons = [d.name()
	272	for d in self.mgr.cache.get_daemons_by_service(f'nfs.ganesha-{service_id}')]
	273	self.mgr.log.info(f'Removing old nfs.ganesha-{service_id} daemons {daemons}')
	274	self.mgr.remove_daemons(daemons)
a4b75251 TL	275	else:
	276	# redeploy all ganesha daemons to ensures that the daemon
	277	# cephx are correct AND container configs are set up properly
	278	daemons = [d.name() for d in self.mgr.cache.get_daemons_by_service(f'nfs.{service_id}')]
	279	self.mgr.log.info(f'Removing old nfs.{service_id} daemons {daemons}')
	280	self.mgr.remove_daemons(daemons)
	281
	282	# re-save service spec (without pool and namespace properties!)
	283	spec = self.mgr.spec_store[f'nfs.{service_id}'].spec
	284	self.mgr.spec_store.save(spec)
	285
	286	# import exports
	287	for export in exports:
	288	ex = ''
	289	for line in export.splitlines():
	290	if (
	291	line.startswith(' secret_access_key =')
	292	or line.startswith(' user_id =')
	293	):
	294	continue
	295	ex += line + '\n'
	296	self.mgr.log.debug(f'importing export: {ex}')
	297	ret, out, err = self.mgr.mon_command({
	298	'prefix': 'nfs export apply',
	299	'cluster_id': service_id
	300	}, inbuf=ex)
	301	if ret:
	302	self.mgr.log.warning(f'Failed to migrate export ({ret}): {err}\nExport was:\n{ex}')
	303	self.mgr.log.info(f'Done migrating nfs.{service_id}')
	304
20effc67 TL	305	def migrate_3_4(self) -> bool:
	306	# We can't set any host with the _admin label, but we're
	307	# going to warn when calling `ceph orch host rm...`
	308	if 'client.admin' not in self.mgr.keys.keys:
	309	self.mgr._client_keyring_set(
	310	entity='client.admin',
	311	placement='label:_admin',
	312	)
	313	return True
	314
	315	def migrate_4_5(self) -> bool:
	316	registry_url = self.mgr.get_module_option('registry_url')
	317	registry_username = self.mgr.get_module_option('registry_username')
	318	registry_password = self.mgr.get_module_option('registry_password')
	319	if registry_url and registry_username and registry_password:
	320
	321	registry_credentials = {'url': registry_url,
	322	'username': registry_username, 'password': registry_password}
	323	self.mgr.set_store('registry_credentials', json.dumps(registry_credentials))
	324
	325	self.mgr.set_module_option('registry_url', None)
	326	self.mgr.check_mon_command({
	327	'prefix': 'config rm',
	328	'who': 'mgr',
	329	'key': 'mgr/cephadm/registry_url',
	330	})
	331	self.mgr.set_module_option('registry_username', None)
	332	self.mgr.check_mon_command({
	333	'prefix': 'config rm',
	334	'who': 'mgr',
	335	'key': 'mgr/cephadm/registry_username',
	336	})
	337	self.mgr.set_module_option('registry_password', None)
	338	self.mgr.check_mon_command({
	339	'prefix': 'config rm',
	340	'who': 'mgr',
	341	'key': 'mgr/cephadm/registry_password',
	342	})
	343
	344	self.mgr.log.info('Done migrating registry login info')
	345	return True
	346
1e59de90 TL	347	def migrate_rgw_spec(self, spec: Dict[Any, Any]) -> Optional[RGWSpec]:
	348	""" Migrate an old rgw spec to the new format."""
	349	new_spec = spec.copy()
	350	field_content: List[str] = re.split(' +', new_spec['spec']['rgw_frontend_type'])
	351	valid_spec = False
	352	if 'beast' in field_content:
	353	new_spec['spec']['rgw_frontend_type'] = 'beast'
	354	field_content.remove('beast')
	355	valid_spec = True
	356	elif 'civetweb' in field_content:
	357	new_spec['spec']['rgw_frontend_type'] = 'civetweb'
	358	field_content.remove('civetweb')
	359	valid_spec = True
	360	else:
	361	# Error: Should not happen as that would be an invalid RGW spec. In that case
	362	# we keep the spec as it, mark it as unmanaged to avoid the daemons being deleted
	363	# and raise a health warning so the user can fix the issue manually later.
	364	self.mgr.log.error("Cannot migrate RGW spec, bad rgw_frontend_type value: {spec['spec']['rgw_frontend_type']}.")
	365
	366	if valid_spec:
	367	new_spec['spec']['rgw_frontend_extra_args'] = []
	368	new_spec['spec']['rgw_frontend_extra_args'].extend(field_content)
	369
	370	return RGWSpec.from_json(new_spec)
	371
	372	def rgw_spec_needs_migration(self, spec: Dict[Any, Any]) -> bool:
05a536ef TL	373	if 'spec' not in spec:
	374	# if users allowed cephadm to set up most of the
	375	# attributes, it's possible there is no "spec" section
	376	# inside the spec. In that case, no migration is needed
	377	return False
1e59de90 TL	378	return 'rgw_frontend_type' in spec['spec'] \
	379	and spec['spec']['rgw_frontend_type'] is not None \
	380	and spec['spec']['rgw_frontend_type'].strip() not in ['beast', 'civetweb']
	381
	382	def migrate_5_6(self) -> bool:
	383	"""
	384	Migration 5 -> 6
	385
	386	Old RGW spec used to allow 'bad' values on the rgw_frontend_type field. For example
	387	the following value used to be valid:
	388
	389	rgw_frontend_type: "beast endpoint=10.16.96.54:8043 tcp_nodelay=1"
	390
	391	As of 17.2.6 release, these kind of entries are not valid anymore and a more strict check
	392	has been added to validate this field.
	393
	394	This migration logic detects this 'bad' values and tries to transform them to the new
	395	valid format where rgw_frontend_type field can only be either 'beast' or 'civetweb'.
	396	Any extra arguments detected on rgw_frontend_type field will be parsed and passed in the
	397	new spec field rgw_frontend_extra_args.
	398	"""
	399	self.mgr.log.debug(f'Starting rgw migration (queue length is {len(self.rgw_migration_queue)})')
	400	for s in self.rgw_migration_queue:
	401	spec = s['spec']
	402	if self.rgw_spec_needs_migration(spec):
	403	rgw_spec = self.migrate_rgw_spec(spec)
	404	if rgw_spec is not None:
	405	logger.info(f"Migrating {spec} to new RGW with extra args format {rgw_spec}")
	406	self.mgr.spec_store.save(rgw_spec)
	407	else:
	408	logger.info(f"No Migration is needed for rgw spec: {spec}")
	409	self.rgw_migration_queue = []
	410	return True
	411
	412
	413	def queue_migrate_rgw_spec(mgr: "CephadmOrchestrator", spec_dict: Dict[Any, Any]) -> None:
	414	"""
	415	As aprt of 17.2.6 a stricter RGW spec validation has been added so the field
	416	rgw_frontend_type cannot be used to pass rgw-frontends parameters.
	417	"""
	418	service_id = spec_dict['spec']['service_id']
	419	queued = mgr.get_store('rgw_migration_queue') or '[]'
	420	ls = json.loads(queued)
	421	ls.append(spec_dict)
	422	mgr.set_store('rgw_migration_queue', json.dumps(ls))
	423	mgr.log.info(f'Queued rgw.{service_id} for migration')
	424
a4b75251 TL	425
	426	def queue_migrate_nfs_spec(mgr: "CephadmOrchestrator", spec_dict: Dict[Any, Any]) -> None:
	427	"""
	428	After 16.2.5 we dropped the NFSServiceSpec pool and namespace properties.
	429	Queue up a migration to process later, once we are sure that RADOS is available
	430	and so on.
	431	"""
	432	service_id = spec_dict['spec']['service_id']
	433	args = spec_dict['spec'].get('spec', {})
	434	pool = args.pop('pool', 'nfs-ganesha')
	435	ns = args.pop('namespace', service_id)
	436	queued = mgr.get_store('nfs_migration_queue') or '[]'
	437	ls = json.loads(queued)
	438	ls.append([service_id, pool, ns])
	439	mgr.set_store('nfs_migration_queue', json.dumps(ls))
	440	mgr.log.info(f'Queued nfs.{service_id} for migration')