]> git.proxmox.com Git - ceph.git/blame - ceph/qa/tasks/rook.py
import quincy beta 17.1.0
[ceph.git] / ceph / qa / tasks / rook.py
CommitLineData
b3b6e05e
TL
1"""
2Rook cluster task
3"""
4import argparse
5import configobj
6import contextlib
7import json
8import logging
9import os
10import yaml
11from io import BytesIO
12
13from tarfile import ReadError
14from tasks.ceph_manager import CephManager
15from teuthology import misc as teuthology
16from teuthology.config import config as teuth_config
17from teuthology.contextutil import safe_while
18from teuthology.orchestra import run
19from teuthology import contextutil
20from tasks.ceph import healthy
21from tasks.cephadm import update_archive_setting
22
23log = logging.getLogger(__name__)
24
20effc67
TL
def path_to_examples(ctx, cluster_name: str) -> str:
    """
    Return the path (relative to the remote's home dir) of rook's example
    manifests, probing the two layouts used by different rook versions.

    :param ctx: teuthology run context (uses ctx.rook[cluster_name].remote)
    :param cluster_name: key into ctx.rook for this cluster
    :return: candidate directory path ending in '/'
    :raises AssertionError: if neither candidate layout exists on the remote
    """
    for p in ['rook/deploy/examples/', 'rook/cluster/examples/kubernetes/ceph/']:
        try:
            # probe: fetching operator.yaml succeeds only if this layout exists
            ctx.rook[cluster_name].remote.get_file(p + 'operator.yaml')
            return p
        except Exception:
            # narrowed from a bare 'except:', which would also swallow
            # KeyboardInterrupt/SystemExit
            continue
    # raise explicitly instead of 'assert False' so the failure survives -O
    raise AssertionError('Path to examples not found')
b3b6e05e
TL
33
34def _kubectl(ctx, config, args, **kwargs):
35 cluster_name = config.get('cluster', 'ceph')
36 return ctx.rook[cluster_name].remote.run(
37 args=['kubectl'] + args,
38 **kwargs
39 )
40
41
def shell(ctx, config):
    """
    Run command(s) inside the rook tools container.

      tasks:
      - kubeadm:
      - rook:
      - rook.shell:
          - ceph -s

    or

      tasks:
      - kubeadm:
      - rook:
      - rook.shell:
          commands:
          - ceph -s

    """
    # a bare list is shorthand for {'commands': [...]}
    if isinstance(config, list):
        config = {'commands': config}
    for cmd in config.get('commands', []):
        argv = cmd.split(' ') if isinstance(cmd, str) else cmd
        _shell(ctx, config, argv)
69
70
def _shell(ctx, config, args, **kwargs):
    """Execute a command inside the rook-ceph toolbox pod via kubectl exec."""
    cluster = config.get('cluster', 'ceph')
    toolbox = ctx.rook[cluster].toolbox
    prefix = ['-n', 'rook-ceph', 'exec', toolbox, '--']
    return _kubectl(ctx, config, prefix + args, **kwargs)
82
83
@contextlib.contextmanager
def rook_operator(ctx, config):
    """
    Clone the rook git repo on the bootstrap remote and deploy the rook
    operator (CRDs, common resources, operator deployment) into the
    'rook-ceph' namespace; tear everything down again on exit.
    """
    cluster_name = config['cluster']
    rook_branch = config.get('rook_branch', 'master')
    rook_git_url = config.get('rook_git_url', 'https://github.com/rook/rook')

    log.info(f'Cloning {rook_git_url} branch {rook_branch}')
    ctx.rook[cluster_name].remote.run(
        args=[
            'rm', '-rf', 'rook',
            run.Raw('&&'),
            'git',
            'clone',
            '--single-branch',
            '--branch', rook_branch,
            rook_git_url,
            'rook',
        ]
    )

    # hoisted: each path_to_examples() call probes the remote, so resolve once
    examples = path_to_examples(ctx, cluster_name)

    # operator.yaml (removed leftover debug logging of cwd and dir(remote))
    operator_yaml = ctx.rook[cluster_name].remote.read_file(
        examples + 'operator.yaml'
    )
    rook_image = config.get('rook_image')
    if rook_image:
        log.info(f'Patching operator to use image {rook_image}')
        crs = list(yaml.load_all(operator_yaml, Loader=yaml.FullLoader))
        assert len(crs) == 2
        crs[1]['spec']['template']['spec']['containers'][0]['image'] = rook_image
        operator_yaml = yaml.dump_all(crs)
    ctx.rook[cluster_name].remote.write_file('operator.yaml', operator_yaml)

    op_job = None
    try:
        log.info('Deploying operator')
        _kubectl(ctx, config, [
            'create',
            '-f', examples + 'crds.yaml',
            '-f', examples + 'common.yaml',
            '-f', 'operator.yaml',
        ])

        # on centos, rook's hostpath usage needs privileged pods
        if teuthology.get_distro(ctx) == 'centos':
            _kubectl(ctx, config, [
                '-n', 'rook-ceph',
                'set', 'env', 'deploy/rook-ceph-operator',
                'ROOK_HOSTPATH_REQUIRES_PRIVILEGED=true'
            ])

        # wait for the operator pod to reach Running
        op_name = None
        with safe_while(sleep=10, tries=90, action="wait for operator") as proceed:
            while not op_name and proceed():
                p = _kubectl(
                    ctx, config,
                    ['-n', 'rook-ceph', 'get', 'pods', '-l', 'app=rook-ceph-operator'],
                    stdout=BytesIO(),
                )
                for line in p.stdout.getvalue().decode('utf-8').strip().splitlines():
                    name, ready, status, _ = line.split(None, 3)
                    if status == 'Running':
                        op_name = name
                        break

        # stream operator logs in the background so they reach the archive
        op_job = _kubectl(
            ctx,
            config,
            ['-n', 'rook-ceph', 'logs', '-f', op_name],
            wait=False,
            logger=log.getChild('operator'),
        )

        yield

    except Exception as e:
        log.exception(e)
        raise

    finally:
        log.info('Cleaning up rook operator')
        _kubectl(ctx, config, [
            'delete',
            '-f', 'operator.yaml',
        ])
        if False:
            # don't bother since we'll tear down k8s anyway (and this mysteriously
            # fails sometimes when deleting some of the CRDs... not sure why!)
            # BUG FIX: path_to_examples() was called without its required
            # (ctx, cluster_name) arguments here -- it would have raised
            # TypeError if this branch were ever enabled.
            _kubectl(ctx, config, [
                'delete',
                '-f', examples + 'common.yaml',
            ])
            _kubectl(ctx, config, [
                'delete',
                '-f', examples + 'crds.yaml',
            ])
        ctx.rook[cluster_name].remote.run(args=['rm', '-rf', 'rook', 'operator.yaml'])
        if op_job:
            op_job.wait()
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'rm', '-rf', '/var/lib/rook'
                ]
            )
        )
196
197
@contextlib.contextmanager
def ceph_log(ctx, config):
    """
    Register rook's log dir for archiving; on teardown, scan the cluster log
    for ERR/WRN/SEC entries (recording a failure reason) and compress/pull
    the logs into the run archive.
    """
    cluster_name = config['cluster']

    log_dir = '/var/lib/rook/rook-ceph/log'
    update_archive_setting(ctx, 'log', log_dir)

    try:
        yield

    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise

    finally:
        log.info('Checking cluster log for badness...')
        def first_in_ceph_log(pattern, excludes):
            """
            Find the first occurrence of the pattern specified in the Ceph log,
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
            """
            args = [
                'sudo',
                'egrep', pattern,
                f'{log_dir}/ceph.log',
            ]
            if excludes:
                for exclude in excludes:
                    args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                run.Raw('|'), 'head', '-n', '1',
            ])
            r = ctx.rook[cluster_name].remote.run(
                stdout=BytesIO(),
                args=args,
            )
            stdout = r.stdout.getvalue().decode()
            if stdout:
                return stdout
            return None

        # FIX: raw strings -- '\[' in a plain literal is an invalid escape
        # sequence (DeprecationWarning now, SyntaxError in future Pythons)
        if first_in_ceph_log(r'\[ERR\]|\[WRN\]|\[SEC\]',
                             config.get('log-ignorelist')) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in [r'\[SEC\]', r'\[ERR\]', r'\[WRN\]']:
                    # FIX: use .get() -- 'log-ignorelist' may be absent from
                    # the task config (the first scan above tolerates that)
                    match = first_in_ceph_log(pattern,
                                              config.get('log-ignorelist'))
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                                match=match.rstrip('\n'),
                            )
                        break

        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            # and logs
            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        log_dir,
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            try:
                os.makedirs(path)
            except OSError:
                pass
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.name)
                try:
                    os.makedirs(sub)
                except OSError:
                    pass
                try:
                    teuthology.pull_directory(remote, log_dir,
                                              os.path.join(sub, 'log'))
                except ReadError:
                    pass
302
303
def build_initial_config(ctx, config):
    """
    Read the bundled rook-ceph.conf template and layer any task-level
    'conf' overrides on top; return the resulting ConfigObj.
    """
    template_path = os.path.join(os.path.dirname(__file__), 'rook-ceph.conf')
    conf = configobj.ConfigObj(template_path, file_error=True)

    # apply per-section overrides from the task config
    for section, keys in config.get('conf', {}).items():
        for key, value in keys.items():
            log.info(" override: [%s] %s = %s", section, key, value)
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    return conf
317
318
@contextlib.contextmanager
def rook_cluster(ctx, config):
    """
    Create the CephCluster custom resource (plus a rook-config-override
    ConfigMap built from rook-ceph.conf) and tear it down again on exit,
    waiting for all non-operator/toolbox/csi pods to stop.
    """
    cluster_name = config['cluster']

    # count how many OSDs we'll create: one per scratch device per host
    num_devs = 0
    num_hosts = 0
    for remote in ctx.cluster.remotes.keys():
        ls = remote.read_file('/scratch_devs').decode('utf-8').strip().splitlines()
        num_devs += len(ls)
        num_hosts += 1
    ctx.rook[cluster_name].num_osds = num_devs

    # config
    ceph_conf = build_initial_config(ctx, config)
    ceph_conf_fp = BytesIO()
    ceph_conf.write(ceph_conf_fp)
    log.info(f'Config:\n{ceph_conf_fp.getvalue()}')
    # BUG FIX: this used to pass ceph_conf (a ConfigObj) where the task
    # config dict belongs; it only worked because ConfigObj happens to have
    # a dict-like .get().  Also decode the conf bytes -- yaml.dump() would
    # otherwise render the ConfigMap value as a !!binary node.
    _kubectl(ctx, config, ['create', '-f', '-'], stdin=yaml.dump({
        'apiVersion': 'v1',
        'kind': 'ConfigMap',
        'metadata': {
            'name': 'rook-config-override',
            'namespace': 'rook-ceph'},
        'data': {
            'config': ceph_conf_fp.getvalue().decode('utf-8')
        }
    }))

    # cluster
    cluster = {
        'apiVersion': 'ceph.rook.io/v1',
        'kind': 'CephCluster',
        'metadata': {'name': 'rook-ceph', 'namespace': 'rook-ceph'},
        'spec': {
            'cephVersion': {
                'image': ctx.rook[cluster_name].image,
                'allowUnsupported': True,
            },
            'dataDirHostPath': '/var/lib/rook',
            'skipUpgradeChecks': True,
            'mgr': {
                'count': 1,
                'modules': [
                    { 'name': 'rook', 'enabled': True },
                ],
            },
            'mon': {
                'count': num_hosts,
                'allowMultiplePerNode': True,
            },
        }
    }
    # let the task config customize the CR spec
    teuthology.deep_merge(cluster['spec'], config.get('spec', {}))

    cluster_yaml = yaml.dump(cluster)
    log.info(f'Cluster:\n{cluster_yaml}')
    try:
        ctx.rook[cluster_name].remote.write_file('cluster.yaml', cluster_yaml)
        _kubectl(ctx, config, ['create', '-f', 'cluster.yaml'])
        yield

    except Exception as e:
        log.exception(e)
        raise

    finally:
        _kubectl(ctx, config, ['delete', '-f', 'cluster.yaml'], check_status=False)

        # wait for cluster to shut down
        log.info('Waiting for cluster to stop')
        running = True
        with safe_while(sleep=5, tries=100, action="wait for teardown") as proceed:
            while running and proceed():
                p = _kubectl(
                    ctx, config,
                    ['-n', 'rook-ceph', 'get', 'pods'],
                    stdout=BytesIO(),
                )
                running = False
                for line in p.stdout.getvalue().decode('utf-8').strip().splitlines():
                    name, ready, status, _ = line.split(None, 3)
                    # csi/operator/toolbox pods are torn down elsewhere
                    if (
                        name != 'NAME'
                        and not name.startswith('csi-')
                        and not name.startswith('rook-ceph-operator-')
                        and not name.startswith('rook-ceph-tools-')
                    ):
                        running = True
                        break

        _kubectl(
            ctx, config,
            ['-n', 'rook-ceph', 'delete', 'configmap', 'rook-config-override'],
            check_status=False,
        )
        ctx.rook[cluster_name].remote.run(args=['rm', '-f', 'cluster.yaml'])
417
@contextlib.contextmanager
def rook_toolbox(ctx, config):
    """
    Deploy the rook-ceph toolbox pod, wait until it is Running, and record
    its pod name in ctx.rook[cluster].toolbox; delete it on exit.
    """
    cluster_name = config['cluster']
    try:
        _kubectl(ctx, config, [
            'create',
            '-f', path_to_examples(ctx, cluster_name) + 'toolbox.yaml',
        ])

        log.info('Waiting for tools container to start')
        toolbox = None
        with safe_while(sleep=5, tries=100, action="wait for toolbox") as proceed:
            while toolbox is None and proceed():
                listing = _kubectl(
                    ctx, config,
                    ['-n', 'rook-ceph', 'get', 'pods', '-l', 'app=rook-ceph-tools'],
                    stdout=BytesIO(),
                )
                output = listing.stdout.getvalue().decode('utf-8').strip()
                for row in output.splitlines():
                    pod_name, _ready, pod_status, _rest = row.split(None, 3)
                    if pod_status == 'Running':
                        toolbox = pod_name
                        break
        ctx.rook[cluster_name].toolbox = toolbox
        yield

    except Exception as e:
        log.exception(e)
        raise

    finally:
        _kubectl(ctx, config, [
            'delete',
            '-f', path_to_examples(ctx, cluster_name) + 'toolbox.yaml',
        ], check_status=False)
453
454
20effc67
TL
@contextlib.contextmanager
def wait_for_orch(ctx, config):
    """Block until 'ceph orch status' reports the rook backend as available."""
    log.info('Waiting for mgr/rook orchestrator to be available')
    with safe_while(sleep=10, tries=90, action="check orch status") as proceed:
        while proceed():
            result = _shell(ctx, config, ['ceph', 'orch', 'status', '-f', 'json'],
                            stdout=BytesIO(),
                            check_status=False)
            if result.exitstatus != 0:
                continue
            status = json.loads(result.stdout.getvalue().decode('utf-8'))
            if status.get('available') and status.get('backend') == 'rook':
                log.info(' mgr/rook orchestrator is active')
                break

    yield
470
471
@contextlib.contextmanager
def rook_post_config(ctx, config):
    """
    Point mgr/rook at the 'scratch' StorageClass and ask the orchestrator
    to deploy OSDs on all available devices. No teardown work is needed.
    """
    try:
        _shell(ctx, config, ['ceph', 'config', 'set', 'mgr', 'mgr/rook/storage_class',
                             'scratch'])
        _shell(ctx, config, ['ceph', 'orch', 'apply', 'osd', '--all-available-devices'])
        yield

    except Exception as e:
        log.exception(e)
        raise
486
487
b3b6e05e
TL
@contextlib.contextmanager
def wait_for_osds(ctx, config):
    """Wait until the number of up OSDs matches the scratch-device count."""
    cluster_name = config.get('cluster', 'ceph')

    want = ctx.rook[cluster_name].num_osds
    log.info(f'Waiting for {want} OSDs')
    with safe_while(sleep=10, tries=90, action="check osd count") as proceed:
        while proceed():
            result = _shell(ctx, config, ['ceph', 'osd', 'stat', '-f', 'json'],
                            stdout=BytesIO(),
                            check_status=False)
            if result.exitstatus != 0:
                continue
            osd_stat = json.loads(result.stdout.getvalue().decode('utf-8'))
            have = osd_stat.get('num_up_osds', 0)
            if have == want:
                break
            log.info(f' have {have}/{want} OSDs')

    yield
507
b3b6e05e
TL
@contextlib.contextmanager
def ceph_config_keyring(ctx, config):
    """
    Copy /etc/ceph/ceph.conf and the admin keyring out of the toolbox pod to
    every test node so local ceph CLI invocations work; remove them on exit.
    """
    # get config and push to hosts
    log.info('Distributing ceph config and client.admin keyring')
    p = _shell(ctx, config, ['cat', '/etc/ceph/ceph.conf'], stdout=BytesIO())
    conf = p.stdout.getvalue()
    p = _shell(ctx, config, ['cat', '/etc/ceph/keyring'], stdout=BytesIO())
    keyring = p.stdout.getvalue()
    ctx.cluster.run(args=['sudo', 'mkdir', '-p', '/etc/ceph'])
    for remote in ctx.cluster.remotes.keys():
        remote.write_file(
            '/etc/ceph/ceph.conf',
            conf,
            sudo=True,
        )
        remote.write_file(
            '/etc/ceph/keyring',
            keyring,
            sudo=True,
        )

    try:
        yield

    except Exception as e:
        log.exception(e)
        raise

    finally:
        log.info('Cleaning up config and client.admin keyring')
        # BUG FIX: we write /etc/ceph/keyring above, but the old cleanup only
        # removed ceph.client.admin.keyring (never written here), leaving the
        # keyring behind on every node.  Remove both paths to be safe.
        ctx.cluster.run(args=[
            'sudo', 'rm', '-f',
            '/etc/ceph/ceph.conf',
            '/etc/ceph/keyring',
            '/etc/ceph/ceph.client.admin.keyring'
        ])
543
544
@contextlib.contextmanager
def ceph_clients(ctx, config):
    """Create auth keys for each client role and install their keyrings."""
    cluster_name = config['cluster']

    log.info('Setting up client nodes...')
    clients = ctx.cluster.only(teuthology.is_type('client', cluster_name))
    for remote, roles_for_host in clients.remotes.items():
        client_roles = teuthology.cluster_roles_of_type(
            roles_for_host, 'client', cluster_name)
        for role in client_roles:
            name = teuthology.ceph_role(role)
            keyring_path = f'/etc/ceph/{cluster_name}.{name}.keyring'
            r = _shell(ctx, config,
                       args=[
                           'ceph', 'auth',
                           'get-or-create', name,
                           'mon', 'allow *',
                           'osd', 'allow *',
                           'mds', 'allow *',
                           'mgr', 'allow *',
                       ],
                       stdout=BytesIO(),
                       )
            remote.write_file(keyring_path, r.stdout.getvalue(),
                              sudo=True, mode='0644')
    yield
571
572
@contextlib.contextmanager
def task(ctx, config):
    """
    Deploy rook-ceph cluster

      tasks:
      - kubeadm:
      - rook:
          branch: wip-foo
          spec:
            mon:
              count: 1

    The spec item is deep-merged against the cluster.yaml.  The branch, sha1, or
    image items are used to determine the Ceph container image.
    """
    if not config:
        config = {}
    assert isinstance(config, dict), \
        "task only supports a dictionary for configuration"

    log.info('Rook start')

    # fold in yaml overrides: 'ceph' first, then 'rook' (rook wins on conflict)
    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('ceph', {}))
    teuthology.deep_merge(config, overrides.get('rook', {}))
    log.info('Config: ' + str(config))

    # set up cluster context
    if not hasattr(ctx, 'rook'):
        ctx.rook = {}
    if 'cluster' not in config:
        config['cluster'] = 'ceph'
    cluster_name = config['cluster']
    if cluster_name not in ctx.rook:
        ctx.rook[cluster_name] = argparse.Namespace()

    # the first remote acts as the bootstrap node for all kubectl work
    ctx.rook[cluster_name].remote = list(ctx.cluster.remotes.keys())[0]

    # image selection precedence: explicit 'image' > 'sha1' (+flavor) > 'branch'
    teuth_defaults = teuth_config.get('defaults', {})
    cephadm_defaults = teuth_defaults.get('cephadm', {})
    containers_defaults = cephadm_defaults.get('containers', {})
    container_image_name = containers_defaults.get('image', None)
    if 'image' in config:
        ctx.rook[cluster_name].image = config.get('image')
    else:
        sha1 = config.get('sha1')
        flavor = config.get('flavor', 'default')
        if sha1:
            if flavor == "crimson":
                ctx.rook[cluster_name].image = container_image_name + ':' + sha1 + '-' + flavor
            else:
                ctx.rook[cluster_name].image = container_image_name + ':' + sha1
        else:
            # hmm, fall back to branch?
            branch = config.get('branch', 'master')
            ctx.rook[cluster_name].image = container_image_name + ':' + branch
    log.info('Ceph image is %s' % ctx.rook[cluster_name].image)

    # nested context managers: each stage sets up on entry and tears down in
    # reverse order on exit
    with contextutil.nested(
        lambda: rook_operator(ctx, config),
        lambda: ceph_log(ctx, config),
        lambda: rook_cluster(ctx, config),
        lambda: rook_toolbox(ctx, config),
        lambda: wait_for_orch(ctx, config),
        lambda: rook_post_config(ctx, config),
        lambda: wait_for_osds(ctx, config),
        lambda: ceph_config_keyring(ctx, config),
        lambda: ceph_clients(ctx, config),
    ):
        if not hasattr(ctx, 'managers'):
            ctx.managers = {}
        # expose a CephManager so downstream tasks can drive the cluster
        ctx.managers[cluster_name] = CephManager(
            ctx.rook[cluster_name].remote,
            ctx=ctx,
            logger=log.getChild('ceph_manager.' + cluster_name),
            cluster=cluster_name,
            rook=True,
        )
        try:
            if config.get('wait-for-healthy', True):
                healthy(ctx=ctx, config=config)
            log.info('Rook complete, yielding')
            yield

        finally:
            # remove orchestrator-managed services before the nested teardown
            # runs -- presumably so their pods don't block cluster deletion
            # (NOTE(review): confirm intent against rook teardown behavior)
            to_remove = []
            ret = _shell(ctx, config, ['ceph', 'orch', 'ls', '-f', 'json'], stdout=BytesIO())
            if ret.exitstatus == 0:
                r = json.loads(ret.stdout.getvalue().decode('utf-8'))
                for service in r:
                    if service['service_type'] in ['rgw', 'mds', 'nfs', 'rbd-mirror']:
                        _shell(ctx, config, ['ceph', 'orch', 'rm', service['service_name']])
                        to_remove.append(service['service_name'])
                # poll until none of the removed services are still listed
                with safe_while(sleep=10, tries=90, action="waiting for service removal") as proceed:
                    while proceed():
                        ret = _shell(ctx, config, ['ceph', 'orch', 'ls', '-f', 'json'], stdout=BytesIO())
                        if ret.exitstatus == 0:
                            r = json.loads(ret.stdout.getvalue().decode('utf-8'))
                            still_up = [service['service_name'] for service in r]
                            matches = set(still_up).intersection(to_remove)
                            if not matches:
                                break
            log.info('Tearing down rook')