import json
import logging
from typing import TYPE_CHECKING, Iterator, Optional, Dict, Any

from ceph.deployment.service_spec import PlacementSpec, ServiceSpec, HostPlacementSpec
from cephadm.schedule import HostAssignment
import rados

from mgr_module import NFS_POOL_NAME
from orchestrator import OrchestratorError, DaemonDescription

if TYPE_CHECKING:
    from .module import CephadmOrchestrator

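# Bump LAST_MIGRATION whenever a new migrate_N_M() step is added to
# Migrations.migrate() below.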
LAST_MIGRATION = 5

logger = logging.getLogger(__name__)


class Migrations:
    def __init__(self, mgr: "CephadmOrchestrator"):
        self.mgr = mgr

        # Why a global counter instead of per-spec versions?
        #
        # For the first migration:
        # The specs don't change in (this) migration, but the scheduler does.
        # Adding a version to the specs at this point just felt wrong to me.
        #
        # Also, the specs are only one of several parts of cephadm that may need upgrades.
        # We have the cache, the inventory, the config store, the upgrade (imagine changing the
        # upgrade code while an old upgrade is still in progress), the naming of daemons,
        # the fs-layout of the daemons, etc.
        if self.mgr.migration_current is None:
            self.set(LAST_MIGRATION)

        v = mgr.get_store('nfs_migration_queue')
        self.nfs_migration_queue = json.loads(v) if v else []
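        # Entries are queued by queue_migrate_nfs_spec(); each one is a
        # [service_id, pool, namespace] triple, e.g. ['foo', 'nfs-ganesha', 'foo']
        # (illustrative values).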

        # For some migrations, we don't need to do anything except
        # increment migration_current; let's try to shortcut those here.
        self.migrate(True)

    def set(self, val: int) -> None:
        self.mgr.set_module_option('migration_current', val)
        self.mgr.migration_current = val

    def is_migration_ongoing(self) -> bool:
        return self.mgr.migration_current != LAST_MIGRATION

    def verify_no_migration(self) -> None:
        if self.is_migration_ongoing():
            # this is raised in module.serve()
            raise OrchestratorError(
                "cephadm migration still ongoing. Please wait until the migration is complete.")

    def migrate(self, startup: bool = False) -> None:
        if self.mgr.migration_current == 0:
            if self.migrate_0_1():
                self.set(1)

        if self.mgr.migration_current == 1:
            if self.migrate_1_2():
                self.set(2)

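        # Migration 2 -> 3 reads NFS exports out of RADOS, so it is skipped
        # while the module is still starting up; queue_migrate_nfs_spec()
        # defers that work until RADOS is known to be available.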
        if self.mgr.migration_current == 2 and not startup:
            if self.migrate_2_3():
                self.set(3)

        if self.mgr.migration_current == 3:
            if self.migrate_3_4():
                self.set(4)

        if self.mgr.migration_current == 4:
            if self.migrate_4_5():
                self.set(5)

    def migrate_0_1(self) -> bool:
        """
        Migration 0 -> 1
        The new scheduler takes the PlacementSpec as a hard bound, not as a recommendation.
        I.e. the new scheduler won't suggest any new placements outside of the hosts
        specified by label etc.

        This means we have to make sure we're not removing any daemons directly after
        upgrading to the new scheduler.

        There is a potential race here:
        1. the user updates their spec to remove daemons
        2. the mgr gets upgraded to the new scheduler, before the old scheduler removed the daemon
        3. now, we're converting the spec to explicit placement, thus reverting (1.)
        I think this is ok.
        """
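        # Illustrative example (not taken from a real cluster): a spec with
        # placement `label: foo, count: 3` is rewritten to an explicit host list
        # naming the hosts its daemons currently run on, so the stricter
        # scheduler won't remove any of them.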

        def interesting_specs() -> Iterator[ServiceSpec]:
            for s in self.mgr.spec_store.all_specs.values():
                if s.unmanaged:
                    continue
                p = s.placement
                if p is None:
                    continue
                if p.count is None:
                    continue
                if not p.hosts and not p.host_pattern and not p.label:
                    continue
                yield s

        def convert_to_explicit(spec: ServiceSpec) -> None:
            existing_daemons = self.mgr.cache.get_daemons_by_service(spec.service_name())
            placements, to_add, to_remove = HostAssignment(
                spec=spec,
                hosts=self.mgr.inventory.all_specs(),
                unreachable_hosts=self.mgr.cache.get_unreachable_hosts(),
                daemons=existing_daemons,
            ).place()

            # We only have to migrate if the new scheduler would remove daemons
            if len(placements) >= len(existing_daemons):
                return

            def to_hostname(d: DaemonDescription) -> HostPlacementSpec:
                if d.hostname in old_hosts:
                    return old_hosts[d.hostname]
                else:
                    assert d.hostname
                    return HostPlacementSpec(d.hostname, '', '')

            old_hosts = {h.hostname: h for h in spec.placement.hosts}
            new_hosts = [to_hostname(d) for d in existing_daemons]

            new_placement = PlacementSpec(
                hosts=new_hosts,
                count=spec.placement.count
            )

            new_spec = ServiceSpec.from_json(spec.to_json())
            new_spec.placement = new_placement

            logger.info(f"Migrating {spec.one_line_str()} to explicit placement")

            self.mgr.spec_store.save(new_spec)

        specs = list(interesting_specs())
        if not specs:
            return True  # nothing to do. shortcut

        if not self.mgr.cache.daemon_cache_filled():
            logger.info("Unable to migrate yet. Daemon Cache still incomplete.")
            return False

        for spec in specs:
            convert_to_explicit(spec)

        return True

    def migrate_1_2(self) -> bool:
        """
        After 15.2.4, we unified some service IDs: MONs, MGRs etc. no longer have a service id.
        Which means the service names changed:

        mon.foo -> mon
        mgr.foo -> mgr

        This fixes the data structure consistency.
        """
        bad_specs = {}
        for name, spec in self.mgr.spec_store.all_specs.items():
            if name != spec.service_name():
                bad_specs[name] = (spec.service_name(), spec)

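        # Keep one spec per canonical service name (preferring an already
        # existing one), mark it unmanaged, and drop the badly named entry.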
        for old, (new, old_spec) in bad_specs.items():
            if new not in self.mgr.spec_store.all_specs:
                spec = old_spec
            else:
                spec = self.mgr.spec_store.all_specs[new]
            spec.unmanaged = True
            self.mgr.spec_store.save(spec)
            self.mgr.spec_store.finally_rm(old)

        return True

    def migrate_2_3(self) -> bool:
        if self.nfs_migration_queue:
            from nfs.cluster import create_ganesha_pool

            create_ganesha_pool(self.mgr)
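            # Entries whose legacy pool is already '.nfs' (NFS_POOL_NAME)
            # presumably need no conversion and are skipped.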
            for service_id, pool, ns in self.nfs_migration_queue:
                if pool != '.nfs':
                    self.migrate_nfs_spec(service_id, pool, ns)
            self.nfs_migration_queue = []
            self.mgr.log.info('Done migrating all NFS services')
        return True

    def migrate_nfs_spec(self, service_id: str, pool: str, ns: Optional[str]) -> None:
        renamed = False
        if service_id.startswith('ganesha-'):
            service_id = service_id[8:]
            renamed = True

        self.mgr.log.info(
            f'Migrating nfs.{service_id} from legacy pool {pool} namespace {ns}'
        )

        # read exports
        ioctx = self.mgr.rados.open_ioctx(pool)
        if ns is not None:
            ioctx.set_namespace(ns)
        object_iterator = ioctx.list_objects()
        exports = []
        while True:
            try:
                obj = object_iterator.__next__()
                if obj.key.startswith('export-'):
                    self.mgr.log.debug(f'reading {obj.key}')
                    exports.append(obj.read().decode())
            except StopIteration:
                break
        self.mgr.log.info(f'Found {len(exports)} exports for legacy nfs.{service_id}')

        # copy grace file
        if service_id != ns:
            try:
                grace = ioctx.read("grace")
                new_ioctx = self.mgr.rados.open_ioctx(NFS_POOL_NAME)
                new_ioctx.set_namespace(service_id)
                new_ioctx.write_full("grace", grace)
                self.mgr.log.info('Migrated nfs-ganesha grace file')
            except rados.ObjectNotFound:
                self.mgr.log.debug('failed to read old grace file; skipping')

        if renamed and f'nfs.ganesha-{service_id}' in self.mgr.spec_store:
            # rename from nfs.ganesha-* to nfs.*. This will destroy old daemons and
            # deploy new ones.
            self.mgr.log.info(f'Replacing nfs.ganesha-{service_id} with nfs.{service_id}')
            spec = self.mgr.spec_store[f'nfs.ganesha-{service_id}'].spec
            self.mgr.spec_store.rm(f'nfs.ganesha-{service_id}')
            spec.service_id = service_id
            self.mgr.spec_store.save(spec, True)

            # We have to remove the old daemons here as well, otherwise we'll end up with a port conflict.
            daemons = [d.name()
                       for d in self.mgr.cache.get_daemons_by_service(f'nfs.ganesha-{service_id}')]
            self.mgr.log.info(f'Removing old nfs.ganesha-{service_id} daemons {daemons}')
            self.mgr.remove_daemons(daemons)
        else:
            # redeploy all ganesha daemons to ensure that the daemons'
            # cephx keys are correct AND container configs are set up properly
            daemons = [d.name() for d in self.mgr.cache.get_daemons_by_service(f'nfs.{service_id}')]
            self.mgr.log.info(f'Removing old nfs.{service_id} daemons {daemons}')
            self.mgr.remove_daemons(daemons)

            # re-save service spec (without pool and namespace properties!)
            spec = self.mgr.spec_store[f'nfs.{service_id}'].spec
            self.mgr.spec_store.save(spec)

        # import exports
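        # Each export is the raw content of an 'export-*' object read above
        # (a ganesha EXPORT block); RGW credential lines (user_id /
        # secret_access_key) are stripped below, presumably because the nfs
        # module regenerates them when the export is re-applied.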
        for export in exports:
            ex = ''
            for line in export.splitlines():
                if (
                    line.startswith('        secret_access_key =')
                    or line.startswith('        user_id =')
                ):
                    continue
                ex += line + '\n'
            self.mgr.log.debug(f'importing export: {ex}')
            ret, out, err = self.mgr.mon_command({
                'prefix': 'nfs export apply',
                'cluster_id': service_id
            }, inbuf=ex)
            if ret:
                self.mgr.log.warning(f'Failed to migrate export ({ret}): {err}\nExport was:\n{ex}')
        self.mgr.log.info(f'Done migrating nfs.{service_id}')

    def migrate_3_4(self) -> bool:
        # We can't set the _admin label on any host ourselves, but we're
        # going to warn when calling `ceph orch host rm ...`
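        # Register client.admin as a cephadm-managed client keyring targeted at
        # hosts carrying the _admin label (see _client_keyring_set()).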
        if 'client.admin' not in self.mgr.keys.keys:
            self.mgr._client_keyring_set(
                entity='client.admin',
                placement='label:_admin',
            )
        return True

    def migrate_4_5(self) -> bool:
        registry_url = self.mgr.get_module_option('registry_url')
        registry_username = self.mgr.get_module_option('registry_username')
        registry_password = self.mgr.get_module_option('registry_password')
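        # Consolidate the three separate module options into a single store key.
        # The stored value looks roughly like (illustrative):
        #   {"url": "registry.example.com", "username": "admin", "password": "..."}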
        if registry_url and registry_username and registry_password:

            registry_credentials = {'url': registry_url,
                                    'username': registry_username, 'password': registry_password}
            self.mgr.set_store('registry_credentials', json.dumps(registry_credentials))

            self.mgr.set_module_option('registry_url', None)
            self.mgr.check_mon_command({
                'prefix': 'config rm',
                'who': 'mgr',
                'key': 'mgr/cephadm/registry_url',
            })
            self.mgr.set_module_option('registry_username', None)
            self.mgr.check_mon_command({
                'prefix': 'config rm',
                'who': 'mgr',
                'key': 'mgr/cephadm/registry_username',
            })
            self.mgr.set_module_option('registry_password', None)
            self.mgr.check_mon_command({
                'prefix': 'config rm',
                'who': 'mgr',
                'key': 'mgr/cephadm/registry_password',
            })

            self.mgr.log.info('Done migrating registry login info')
        return True


def queue_migrate_nfs_spec(mgr: "CephadmOrchestrator", spec_dict: Dict[Any, Any]) -> None:
    """
    After 16.2.5 we dropped the NFSServiceSpec pool and namespace properties.
    Queue up a migration to process later, once we are sure that RADOS is
    available and so on.
    """
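    # spec_dict is the JSON form of an old NFSServiceSpec, roughly (illustrative):
    #   {'spec': {'service_type': 'nfs', 'service_id': 'foo',
    #             'spec': {'pool': 'nfs-ganesha', 'namespace': 'foo'}}}
    # Only service_id, pool and namespace are consumed here.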
    service_id = spec_dict['spec']['service_id']
    args = spec_dict['spec'].get('spec', {})
    pool = args.pop('pool', 'nfs-ganesha')
    ns = args.pop('namespace', service_id)
    queued = mgr.get_store('nfs_migration_queue') or '[]'
    ls = json.loads(queued)
    ls.append([service_id, pool, ns])
    mgr.set_store('nfs_migration_queue', json.dumps(ls))
    mgr.log.info(f'Queued nfs.{service_id} for migration')