# Extracted from ceph.git: ceph/src/pybind/mgr/cephadm/migrations.py
# (commit 69f39cb9107703eb286b7ce0ee7085ace3bde420)
import json
import logging

from typing import TYPE_CHECKING, Iterator, Optional, Dict, Any

from ceph.deployment.service_spec import PlacementSpec, ServiceSpec, HostPlacementSpec
from cephadm.schedule import HostAssignment

import rados

from mgr_module import NFS_POOL_NAME
from orchestrator import OrchestratorError, DaemonDescription

if TYPE_CHECKING:
    from .module import CephadmOrchestrator
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
def __init__(self, mgr: "CephadmOrchestrator"):
    """Record the orchestrator handle, initialize the migration counter and
    the queued legacy-NFS migrations, then attempt a startup migration pass."""
    self.mgr = mgr

    # Why having a global counter, instead of spec versions?
    #
    # for the first migration:
    # The specs don't change in (this) migration. but the scheduler here.
    # Adding the version to the specs at this time just felt wrong to me.
    #
    # And the specs are only another part of cephadm which needs potential upgrades.
    # We have the cache, the inventory, the config store, the upgrade (imagine changing the
    # upgrade code, while an old upgrade is still in progress), naming of daemons,
    # fs-layout of the daemons, etc.
    if self.mgr.migration_current is None:
        # Fresh deployment: nothing to migrate, start at the latest version.
        self.set(LAST_MIGRATION)

    v = mgr.get_store('nfs_migration_queue')
    self.nfs_migration_queue = json.loads(v) if v else []

    # for some migrations, we don't need to do anything except for
    # incrementing migration_current.
    # let's try to shortcut things here.
    self.migrate(True)
def set(self, val: int) -> None:
    """Persist *val* as the current migration version and cache it on the mgr.

    Note: the method name intentionally shadows the ``set`` builtin; it is
    only ever called as ``self.set(...)``.
    """
    self.mgr.set_module_option('migration_current', val)
    self.mgr.migration_current = val
def is_migration_ongoing(self) -> bool:
    """Return True while migration_current has not yet reached LAST_MIGRATION."""
    return self.mgr.migration_current != LAST_MIGRATION
def verify_no_migration(self) -> None:
    """Raise OrchestratorError while a migration is still in progress."""
    if self.is_migration_ongoing():
        # this is raised in module.serve()
        raise OrchestratorError(
            "cephadm migration still ongoing. Please wait, until the migration is complete.")
def migrate(self, startup: bool = False) -> None:
    """Run every outstanding migration step in order.

    Each step only fires when ``migration_current`` is exactly at its
    starting version; a step returning True advances the counter, which
    lets the following ``if`` pick up the next step in the same call.
    """
    if self.mgr.migration_current == 0:
        if self.migrate_0_1():
            self.set(1)

    if self.mgr.migration_current == 1:
        if self.migrate_1_2():
            self.set(2)

    # 2 -> 3 imports the nfs mgr module and talks to RADOS
    # (see migrate_2_3 / migrate_nfs_spec), so it is skipped
    # during module startup and retried later.
    if self.mgr.migration_current == 2 and not startup:
        if self.migrate_2_3():
            self.set(3)

    if self.mgr.migration_current == 3:
        if self.migrate_3_4():
            self.set(4)

    if self.mgr.migration_current == 4:
        if self.migrate_4_5():
            self.set(5)
def migrate_0_1(self) -> bool:
    """
    Migration 0 -> 1
    New scheduler that takes PlacementSpec as the bound and not as recommendation.
    I.e. the new scheduler won't suggest any new placements outside of the hosts
    specified by label etc.

    Which means, we have to make sure, we're not removing any daemons directly after
    upgrading to the new scheduler.

    There is a potential race here:
    1. user updates his spec to remove daemons
    2. mgr gets upgraded to new scheduler, before the old scheduler removed the daemon
    3. now, we're converting the spec to explicit placement, thus reverting (1.)
    """
    def interesting_specs() -> Iterator[ServiceSpec]:
        # Only count-based placements that are additionally constrained by
        # hosts/host_pattern/label are affected by the scheduler change.
        for s in self.mgr.spec_store.all_specs.values():
            if s.unmanaged:
                continue
            p = s.placement
            if p is None:
                continue
            if p.count is None:
                continue
            if not p.hosts and not p.host_pattern and not p.label:
                continue
            yield s

    def convert_to_explicit(spec: ServiceSpec) -> None:
        existing_daemons = self.mgr.cache.get_daemons_by_service(spec.service_name())
        placements, to_add, to_remove = HostAssignment(
            spec=spec,
            hosts=self.mgr.inventory.all_specs(),
            unreachable_hosts=self.mgr.cache.get_unreachable_hosts(),
            draining_hosts=self.mgr.cache.get_draining_hosts(),
            daemons=existing_daemons,
        ).place()

        # We have to migrate, only if the new scheduler would remove daemons
        if len(placements) >= len(existing_daemons):
            return

        def to_hostname(d: DaemonDescription) -> HostPlacementSpec:
            if d.hostname in old_hosts:
                return old_hosts[d.hostname]
            else:
                assert d.hostname
                return HostPlacementSpec(d.hostname, '', '')

        old_hosts = {h.hostname: h for h in spec.placement.hosts}
        new_hosts = [to_hostname(d) for d in existing_daemons]

        new_placement = PlacementSpec(
            hosts=new_hosts,
            count=spec.placement.count
        )

        new_spec = ServiceSpec.from_json(spec.to_json())
        new_spec.placement = new_placement

        logger.info(f"Migrating {spec.one_line_str()} to explicit placement")

        self.mgr.spec_store.save(new_spec)

    specs = list(interesting_specs())
    if not specs:
        return True  # nothing to do. shortcut

    if not self.mgr.cache.daemon_cache_filled():
        logger.info("Unable to migrate yet. Daemon Cache still incomplete.")
        return False

    for spec in specs:
        convert_to_explicit(spec)

    return True
def migrate_1_2(self) -> bool:
    """
    After 15.2.4, we unified some service IDs: MONs, MGRs etc no longer have a service id.
    Which means, the service names changed:

    mon.foo -> mon
    mgr.foo -> mgr

    This fixes the data structure consistency
    """
    # Collect specs stored under a stale name (key != spec.service_name()).
    bad_specs = {}
    for name, spec in self.mgr.spec_store.all_specs.items():
        if name != spec.service_name():
            bad_specs[name] = (spec.service_name(), spec)

    for old, (new, old_spec) in bad_specs.items():
        # Prefer a spec already stored under the new name; otherwise
        # re-save the old spec under its proper name.
        if new not in self.mgr.spec_store.all_specs:
            spec = old_spec
        else:
            spec = self.mgr.spec_store.all_specs[new]
        spec.unmanaged = True
        self.mgr.spec_store.save(spec)
        self.mgr.spec_store.finally_rm(old)

    return True
def migrate_2_3(self) -> bool:
    """Migration 2 -> 3: process any queued legacy NFS cluster migrations."""
    if self.nfs_migration_queue:
        # Imported lazily: the nfs mgr module may not be loadable at
        # cephadm module init time.
        from nfs.cluster import create_ganesha_pool

        create_ganesha_pool(self.mgr)
        for service_id, pool, ns in self.nfs_migration_queue:
            self.migrate_nfs_spec(service_id, pool, ns)
        self.nfs_migration_queue = []
        self.mgr.log.info('Done migrating all NFS services')
    return True
def migrate_nfs_spec(self, service_id: str, pool: str, ns: Optional[str]) -> None:
    """Migrate one legacy NFS cluster out of its private pool/namespace.

    Reads the legacy export objects and the grace file from (pool, ns),
    renames ``nfs.ganesha-*`` specs to ``nfs.*``, redeploys the daemons and
    re-imports the exports via ``nfs export apply``.
    """
    renamed = False
    if service_id.startswith('ganesha-'):
        # strip the legacy 'ganesha-' prefix from the cluster name
        service_id = service_id[8:]
        renamed = True

    self.mgr.log.info(
        f'Migrating nfs.{service_id} from legacy pool {pool} namespace {ns}'
    )

    # read exports from the legacy pool/namespace
    exports = []
    ioctx = self.mgr.rados.open_ioctx(pool)
    if ns is not None:
        ioctx.set_namespace(ns)
    object_iterator = ioctx.list_objects()
    while True:
        try:
            obj = object_iterator.__next__()
            if obj.key.startswith('export-'):
                self.mgr.log.debug(f'reading {obj.key}')
                exports.append(obj.read().decode())
        except StopIteration:
            break
    self.mgr.log.info(f'Found {len(exports)} exports for legacy nfs.{service_id}')

    # copy the ganesha grace file into the unified NFS pool
    if service_id != ns:
        try:
            grace = ioctx.read("grace")
            new_ioctx = self.mgr.rados.open_ioctx(NFS_POOL_NAME)
            new_ioctx.set_namespace(service_id)
            new_ioctx.write_full("grace", grace)
            self.mgr.log.info('Migrated nfs-ganesha grace file')
        except rados.ObjectNotFound:
            self.mgr.log.debug('failed to read old grace file; skipping')

    if renamed and f'nfs.ganesha-{service_id}' in self.mgr.spec_store:
        # rename from nfs.ganesha-* to nfs.*. This will destroy old daemons and
        # deploy new ones.
        self.mgr.log.info(f'Replacing nfs.ganesha-{service_id} with nfs.{service_id}')
        spec = self.mgr.spec_store[f'nfs.ganesha-{service_id}'].spec
        self.mgr.spec_store.rm(f'nfs.ganesha-{service_id}')
        spec.service_id = service_id
        self.mgr.spec_store.save(spec, True)

        # We have to remove the old daemons here as well, otherwise we'll end up with a port conflict.
        daemons = [d.name()
                   for d in self.mgr.cache.get_daemons_by_service(f'nfs.ganesha-{service_id}')]
        self.mgr.log.info(f'Removing old nfs.ganesha-{service_id} daemons {daemons}')
        self.mgr.remove_daemons(daemons)
    else:
        # redeploy all ganesha daemons to ensures that the daemon
        # cephx are correct AND container configs are set up properly
        daemons = [d.name() for d in self.mgr.cache.get_daemons_by_service(f'nfs.{service_id}')]
        self.mgr.log.info(f'Removing old nfs.{service_id} daemons {daemons}')
        self.mgr.remove_daemons(daemons)

        # re-save service spec (without pool and namespace properties!)
        spec = self.mgr.spec_store[f'nfs.{service_id}'].spec
        self.mgr.spec_store.save(spec)

    # import exports, stripping the secrets that the new module regenerates
    for export in exports:
        ex = ''
        for line in export.splitlines():
            if (
                    line.startswith('        secret_access_key =')
                    or line.startswith('        user_id =')
            ):
                continue
            ex += line + '\n'
        self.mgr.log.debug(f'importing export: {ex}')
        ret, out, err = self.mgr.mon_command({
            'prefix': 'nfs export apply',
            'cluster_id': service_id
        }, inbuf=ex)
        if ret:
            self.mgr.log.warning(f'Failed to migrate export ({ret}): {err}\nExport was:\n{ex}')
    self.mgr.log.info(f'Done migrating nfs.{service_id}')
def migrate_3_4(self) -> bool:
    """Migration 3 -> 4: register a client.admin keyring spec targeting
    hosts carrying the ``_admin`` label, if one is not present yet."""
    # We can't set any host with the _admin label, but we're
    # going to warn when calling `ceph orch host rm...`
    if 'client.admin' not in self.mgr.keys.keys:
        self.mgr._client_keyring_set(
            entity='client.admin',
            placement='label:_admin',
        )
    return True
def migrate_4_5(self) -> bool:
    """Migration 4 -> 5: consolidate the three separate registry login
    module options into a single 'registry_credentials' store entry,
    then clear the old options (both cached and in the mon config)."""
    registry_url = self.mgr.get_module_option('registry_url')
    registry_username = self.mgr.get_module_option('registry_username')
    registry_password = self.mgr.get_module_option('registry_password')
    if registry_url and registry_username and registry_password:

        registry_credentials = {'url': registry_url,
                                'username': registry_username, 'password': registry_password}
        self.mgr.set_store('registry_credentials', json.dumps(registry_credentials))

        self.mgr.set_module_option('registry_url', None)
        self.mgr.check_mon_command({
            'prefix': 'config rm',
            'who': 'mgr',
            'key': 'mgr/cephadm/registry_url',
        })
        self.mgr.set_module_option('registry_username', None)
        self.mgr.check_mon_command({
            'prefix': 'config rm',
            'who': 'mgr',
            'key': 'mgr/cephadm/registry_username',
        })
        self.mgr.set_module_option('registry_password', None)
        self.mgr.check_mon_command({
            'prefix': 'config rm',
            'who': 'mgr',
            'key': 'mgr/cephadm/registry_password',
        })

        self.mgr.log.info('Done migrating registry login info')
    return True
def queue_migrate_nfs_spec(mgr: "CephadmOrchestrator", spec_dict: Dict[Any, Any]) -> None:
    """
    After 16.2.5 we dropped the NFSServiceSpec pool and namespace properties.
    Queue up a migration to process later, once we are sure that RADOS is available
    and so on.
    """
    service_id = spec_dict['spec']['service_id']
    args = spec_dict['spec'].get('spec', {})
    # pop the legacy properties so the spec can be loaded without them;
    # defaults mirror the legacy behavior (pool 'nfs-ganesha', ns == id)
    pool = args.pop('pool', 'nfs-ganesha')
    ns = args.pop('namespace', service_id)
    queued = mgr.get_store('nfs_migration_queue') or '[]'
    ls = json.loads(queued)
    ls.append([service_id, pool, ns])
    mgr.set_store('nfs_migration_queue', json.dumps(ls))
    mgr.log.info(f'Queued nfs.{service_id} for migration')