import json
import logging
from typing import TYPE_CHECKING, Iterator, Optional, Dict, Any

from ceph.deployment.service_spec import PlacementSpec, ServiceSpec, HostPlacementSpec
from cephadm.schedule import HostAssignment
import rados

from mgr_module import NFS_POOL_NAME
from orchestrator import OrchestratorError, DaemonDescription

if TYPE_CHECKING:
    from .module import CephadmOrchestrator

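# Highest migration known to this module.  Fresh clusters start here, so no
# migration code runs for them; upgraded clusters step up to this value one
# migration at a time via Migrations.migrate().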
LAST_MIGRATION = 5

logger = logging.getLogger(__name__)


class Migrations:
    def __init__(self, mgr: "CephadmOrchestrator"):
        self.mgr = mgr

        # Why a global counter instead of per-spec versions?
        #
        # For the first migration:
        # the specs don't change in (this) migration, only the scheduler does.
        # Adding a version to the specs at this point just felt wrong.
        #
        # Besides, the specs are only one part of cephadm that may need upgrades.
        # We also have the cache, the inventory, the config store, the upgrade
        # (imagine changing the upgrade code while an old upgrade is still in
        # progress), the naming of daemons, the fs-layout of the daemons, etc.
        if self.mgr.migration_current is None:
            self.set(LAST_MIGRATION)

        v = mgr.get_store('nfs_migration_queue')
        self.nfs_migration_queue = json.loads(v) if v else []

        # For some migrations we don't need to do anything except increment
        # migration_current, so let's try to shortcut things here.
        self.migrate(True)

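    # Example (illustrative, not from the source): a freshly bootstrapped
    # cluster has migration_current unset, so the constructor above pins it to
    # LAST_MIGRATION and migrate() finds nothing to do; a cluster upgraded from
    # an older release steps through the pending migrations one by one.
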
    def set(self, val: int) -> None:
        self.mgr.set_module_option('migration_current', val)
        self.mgr.migration_current = val

    def is_migration_ongoing(self) -> bool:
        return self.mgr.migration_current != LAST_MIGRATION

    def verify_no_migration(self) -> None:
        if self.is_migration_ongoing():
            # this is raised in module.serve()
            raise OrchestratorError(
                "cephadm migration still ongoing. Please wait until the migration is complete.")

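    # Each step below runs at most once per call: when a migrate_X_Y() reports
    # success, set() bumps migration_current, which makes the next `if` block
    # eligible within the same migrate() call.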
    def migrate(self, startup: bool = False) -> None:
        if self.mgr.migration_current == 0:
            if self.migrate_0_1():
                self.set(1)

        if self.mgr.migration_current == 1:
            if self.migrate_1_2():
                self.set(2)

        if self.mgr.migration_current == 2 and not startup:
            if self.migrate_2_3():
                self.set(3)

        if self.mgr.migration_current == 3:
            if self.migrate_3_4():
                self.set(4)

        if self.mgr.migration_current == 4:
            if self.migrate_4_5():
                self.set(5)

    def migrate_0_1(self) -> bool:
        """
        Migration 0 -> 1
        New scheduler that takes PlacementSpec as a bound and not as a recommendation.
        I.e. the new scheduler won't suggest any new placements outside of the hosts
        specified by label etc.

        This means we have to make sure we're not removing any daemons directly
        after upgrading to the new scheduler.

        There is a potential race here:
        1. user updates their spec to remove daemons
        2. mgr gets upgraded to the new scheduler, before the old scheduler removed the daemon
        3. now, we're converting the spec to explicit placement, thus reverting (1.)
        I think this is ok.
        """

        def interesting_specs() -> Iterator[ServiceSpec]:
            for s in self.mgr.spec_store.all_specs.values():
                if s.unmanaged:
                    continue
                p = s.placement
                if p is None:
                    continue
                if p.count is None:
                    continue
                if not p.hosts and not p.host_pattern and not p.label:
                    continue
                yield s

        def convert_to_explicit(spec: ServiceSpec) -> None:
            existing_daemons = self.mgr.cache.get_daemons_by_service(spec.service_name())
            placements, to_add, to_remove = HostAssignment(
                spec=spec,
                hosts=self.mgr.inventory.all_specs(),
                unreachable_hosts=self.mgr.cache.get_unreachable_hosts(),
                draining_hosts=self.mgr.cache.get_draining_hosts(),
                daemons=existing_daemons,
            ).place()

            # We only have to migrate if the new scheduler would remove daemons
            if len(placements) >= len(existing_daemons):
                return

            def to_hostname(d: DaemonDescription) -> HostPlacementSpec:
                if d.hostname in old_hosts:
                    return old_hosts[d.hostname]
                else:
                    assert d.hostname
                    return HostPlacementSpec(d.hostname, '', '')

            old_hosts = {h.hostname: h for h in spec.placement.hosts}
            new_hosts = [to_hostname(d) for d in existing_daemons]

            new_placement = PlacementSpec(
                hosts=new_hosts,
                count=spec.placement.count
            )

            new_spec = ServiceSpec.from_json(spec.to_json())
            new_spec.placement = new_placement

            logger.info(f"Migrating {spec.one_line_str()} to explicit placement")

            self.mgr.spec_store.save(new_spec)

        specs = list(interesting_specs())
        if not specs:
            return True  # nothing to do. shortcut

        if not self.mgr.cache.daemon_cache_filled():
            logger.info("Unable to migrate yet. Daemon Cache still incomplete.")
            return False

        for spec in specs:
            convert_to_explicit(spec)

        return True

    def migrate_1_2(self) -> bool:
        """
        After 15.2.4, we unified some service IDs: MONs, MGRs, etc. no longer have a service id.
        This means the service names changed:

        mon.foo -> mon
        mgr.foo -> mgr

        This fixes the data structure consistency.
        """
        bad_specs = {}
        for name, spec in self.mgr.spec_store.all_specs.items():
            if name != spec.service_name():
                bad_specs[name] = (spec.service_name(), spec)

        for old, (new, old_spec) in bad_specs.items():
            if new not in self.mgr.spec_store.all_specs:
                spec = old_spec
            else:
                spec = self.mgr.spec_store.all_specs[new]
            spec.unmanaged = True
            self.mgr.spec_store.save(spec)
            self.mgr.spec_store.finally_rm(old)

        return True

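    # migrate_2_3 drains the queue that queue_migrate_nfs_spec() (at the bottom
    # of this file) fills with legacy NFS specs.  It is skipped at startup (see
    # the `not startup` check in migrate()) so that RADOS is available when the
    # legacy pools are read.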
    def migrate_2_3(self) -> bool:
        if self.nfs_migration_queue:
            from nfs.cluster import create_ganesha_pool

            create_ganesha_pool(self.mgr)
            for service_id, pool, ns in self.nfs_migration_queue:
                if pool != '.nfs':
                    self.migrate_nfs_spec(service_id, pool, ns)
            self.nfs_migration_queue = []
            self.mgr.log.info('Done migrating all NFS services')
        return True

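    # Each queue entry is a [service_id, pool, ns] triple as recorded by
    # queue_migrate_nfs_spec().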
    def migrate_nfs_spec(self, service_id: str, pool: str, ns: Optional[str]) -> None:
        renamed = False
        if service_id.startswith('ganesha-'):
            service_id = service_id[8:]
            renamed = True

        self.mgr.log.info(
            f'Migrating nfs.{service_id} from legacy pool {pool} namespace {ns}'
        )

        # read exports
        ioctx = self.mgr.rados.open_ioctx(pool)
        if ns is not None:
            ioctx.set_namespace(ns)
        object_iterator = ioctx.list_objects()
        exports = []
        while True:
            try:
                obj = object_iterator.__next__()
                if obj.key.startswith('export-'):
                    self.mgr.log.debug(f'reading {obj.key}')
                    exports.append(obj.read().decode())
            except StopIteration:
                break
        self.mgr.log.info(f'Found {len(exports)} exports for legacy nfs.{service_id}')

        # copy grace file
        if service_id != ns:
            try:
                grace = ioctx.read("grace")
                new_ioctx = self.mgr.rados.open_ioctx(NFS_POOL_NAME)
                new_ioctx.set_namespace(service_id)
                new_ioctx.write_full("grace", grace)
                self.mgr.log.info('Migrated nfs-ganesha grace file')
            except rados.ObjectNotFound:
                self.mgr.log.debug('failed to read old grace file; skipping')

        if renamed and f'nfs.ganesha-{service_id}' in self.mgr.spec_store:
            # rename from nfs.ganesha-* to nfs.*. This will destroy old daemons and
            # deploy new ones.
            self.mgr.log.info(f'Replacing nfs.ganesha-{service_id} with nfs.{service_id}')
            spec = self.mgr.spec_store[f'nfs.ganesha-{service_id}'].spec
            self.mgr.spec_store.rm(f'nfs.ganesha-{service_id}')
            spec.service_id = service_id
            self.mgr.spec_store.save(spec, True)

            # We have to remove the old daemons here as well, otherwise we'll end
            # up with a port conflict.
            daemons = [d.name()
                       for d in self.mgr.cache.get_daemons_by_service(f'nfs.ganesha-{service_id}')]
            self.mgr.log.info(f'Removing old nfs.ganesha-{service_id} daemons {daemons}')
            self.mgr.remove_daemons(daemons)
        else:
            # redeploy all ganesha daemons to ensure that the daemon cephx keys
            # are correct AND container configs are set up properly
            daemons = [d.name() for d in self.mgr.cache.get_daemons_by_service(f'nfs.{service_id}')]
            self.mgr.log.info(f'Removing old nfs.{service_id} daemons {daemons}')
            self.mgr.remove_daemons(daemons)

            # re-save service spec (without pool and namespace properties!)
            spec = self.mgr.spec_store[f'nfs.{service_id}'].spec
            self.mgr.spec_store.save(spec)

        # import exports
        for export in exports:
            ex = ''
            for line in export.splitlines():
                if (
                        line.startswith(' secret_access_key =')
                        or line.startswith(' user_id =')
                ):
                    continue
                ex += line + '\n'
            self.mgr.log.debug(f'importing export: {ex}')
            ret, out, err = self.mgr.mon_command({
                'prefix': 'nfs export apply',
                'cluster_id': service_id
            }, inbuf=ex)
            if ret:
                self.mgr.log.warning(f'Failed to migrate export ({ret}): {err}\nExport was:\n{ex}')
        self.mgr.log.info(f'Done migrating nfs.{service_id}')

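    # migrate_3_4 makes sure the client.admin keyring is tracked and placed on
    # all hosts carrying the _admin label.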
    def migrate_3_4(self) -> bool:
        # We can't set any host with the _admin label, but we're
        # going to warn when calling `ceph orch host rm...`
        if 'client.admin' not in self.mgr.keys.keys:
            self.mgr._client_keyring_set(
                entity='client.admin',
                placement='label:_admin',
            )
        return True

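    # migrate_4_5 consolidates the three registry_* module options into a single
    # 'registry_credentials' entry in the config-key store and clears the old
    # per-option values.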
    def migrate_4_5(self) -> bool:
        registry_url = self.mgr.get_module_option('registry_url')
        registry_username = self.mgr.get_module_option('registry_username')
        registry_password = self.mgr.get_module_option('registry_password')
        if registry_url and registry_username and registry_password:

            registry_credentials = {'url': registry_url,
                                    'username': registry_username, 'password': registry_password}
            self.mgr.set_store('registry_credentials', json.dumps(registry_credentials))

            self.mgr.set_module_option('registry_url', None)
            self.mgr.check_mon_command({
                'prefix': 'config rm',
                'who': 'mgr',
                'key': 'mgr/cephadm/registry_url',
            })
            self.mgr.set_module_option('registry_username', None)
            self.mgr.check_mon_command({
                'prefix': 'config rm',
                'who': 'mgr',
                'key': 'mgr/cephadm/registry_username',
            })
            self.mgr.set_module_option('registry_password', None)
            self.mgr.check_mon_command({
                'prefix': 'config rm',
                'who': 'mgr',
                'key': 'mgr/cephadm/registry_password',
            })

            self.mgr.log.info('Done migrating registry login info')
        return True


def queue_migrate_nfs_spec(mgr: "CephadmOrchestrator", spec_dict: Dict[Any, Any]) -> None:
    """
    After 16.2.5 we dropped the NFSServiceSpec pool and namespace properties.
    Queue up a migration to process later, once we are sure that RADOS is
    available and so on.
    """
    service_id = spec_dict['spec']['service_id']
    args = spec_dict['spec'].get('spec', {})
    pool = args.pop('pool', 'nfs-ganesha')
    ns = args.pop('namespace', service_id)
    queued = mgr.get_store('nfs_migration_queue') or '[]'
    ls = json.loads(queued)
    ls.append([service_id, pool, ns])
    mgr.set_store('nfs_migration_queue', json.dumps(ls))
    mgr.log.info(f'Queued nfs.{service_id} for migration')
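
# Illustrative input (values made up): queue_migrate_nfs_spec is handed the raw
# spec dict of a legacy NFS service, e.g.
#   {'spec': {'service_type': 'nfs', 'service_id': 'ganesha-foo',
#             'spec': {'pool': 'nfs-ganesha', 'namespace': 'foo'}}}
# and records ['ganesha-foo', 'nfs-ganesha', 'foo'] for migrate_2_3() to pick up.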