# ceph/qa/tasks/ceph_deploy.py
"""
Execute ceph-deploy as a task
"""
from cStringIO import StringIO

import contextlib
import os
import time
import logging
import traceback

from teuthology import misc as teuthology
from teuthology import contextutil
from teuthology.config import config as teuth_config
from teuthology.task import install as install_fn
from teuthology.orchestra import run
from tasks.cephfs.filesystem import Filesystem

log = logging.getLogger(__name__)


@contextlib.contextmanager
def download_ceph_deploy(ctx, config):
    """
    Downloads ceph-deploy from the ceph.com git mirror and (by default)
    switches to the master branch. If `ceph-deploy-branch` is specified, it
    will use that instead. The `bootstrap` script is run with the argument
    obtained from `python_version`, if specified.
    """
    ceph_admin = ctx.cluster.only(teuthology.get_first_mon(ctx, config))
    if config.get('python_version') is not None:
        py_ver = str(config['python_version'])
        supported_versions = ['2', '3']
        if py_ver not in supported_versions:
            raise ValueError("python_version must be: {}, not {}".format(
                ' or '.join(supported_versions), py_ver))

        log.info("Installing Python")
        for admin in ceph_admin.remotes:
            system_type = teuthology.get_system_type(admin)

            if system_type == 'rpm':
                package = 'python34' if py_ver == '3' else 'python'
                ctx.cluster.run(args=[
                    'sudo', 'yum', '-y', 'install',
                    package, 'python-virtualenv'
                ])
            else:
                package = 'python3' if py_ver == '3' else 'python'
                ctx.cluster.run(args=[
                    'sudo', 'apt-get', '-y', '--force-yes', 'install',
                    package, 'python-virtualenv'
                ])

    log.info('Downloading ceph-deploy...')
    testdir = teuthology.get_testdir(ctx)
    ceph_deploy_branch = config.get('ceph-deploy-branch', 'master')

    ceph_admin.run(args=[
        'git', 'clone', '-b', ceph_deploy_branch,
        teuth_config.ceph_git_base_url + 'ceph-deploy.git',
        '{tdir}/ceph-deploy'.format(tdir=testdir),
    ])

    # Run the bootstrap script from the fresh checkout, optionally passing
    # the requested python_version to it.
    args = [
        'cd', '{tdir}/ceph-deploy'.format(tdir=testdir),
        run.Raw('&&'), './bootstrap',
    ]
    if config.get('python_version') is not None:
        args.append(str(config['python_version']))
    ceph_admin.run(args=args)

    try:
        yield
    finally:
        log.info('Removing ceph-deploy ...')
        ceph_admin.run(args=[
            'rm', '-rf', '{tdir}/ceph-deploy'.format(tdir=testdir),
        ])
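

# A minimal sketch (illustrative only, not part of the task) of the config keys
# download_ceph_deploy() reads above: 'ceph-deploy-branch' selects the branch to
# clone and 'python_version' ('2' or '3') is handed to the bootstrap script.
# The values shown are assumptions for the example, not taken from any real job.
_example_download_config = {
    'ceph-deploy-branch': 'master',
    'python_version': '3',
}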


def is_healthy(ctx, config):
    """Wait until a Ceph cluster is healthy."""
    testdir = teuthology.get_testdir(ctx)
    ceph_admin = teuthology.get_first_mon(ctx, config)
    (remote,) = ctx.cluster.only(ceph_admin).remotes.keys()
    max_tries = 90  # 90 tries * 10 secs --> 15 minutes
    tries = 0
    while True:
        tries += 1
        if tries >= max_tries:
            msg = "ceph health was unable to get 'HEALTH_OK' after waiting 15 minutes"
            raise RuntimeError(msg)

        r = remote.run(
            args=[
                'cd', '{tdir}'.format(tdir=testdir), run.Raw('&&'),
                'sudo', 'ceph', 'health',
            ],
            stdout=StringIO(),
            logger=log.getChild('health'),
        )
        out = r.stdout.getvalue()
        log.info('Ceph health: %s', out.rstrip('\n'))
        if out.split(None, 1)[0] == 'HEALTH_OK':
            break
        time.sleep(10)


def get_nodes_using_role(ctx, target_role):
    """
    Extract the names of nodes that match a given role from a cluster, and modify the
    cluster's service IDs to match the resulting node-based naming scheme that ceph-deploy
    uses, such that if "mon.a" is on host "foo23", it'll be renamed to "mon.foo23".
    """
    # Nodes containing a service of the specified role
    nodes_of_interest = []

    # Prepare a modified version of cluster.remotes with ceph-deploy-ized names
    modified_remotes = {}

    for _remote, roles_for_host in ctx.cluster.remotes.iteritems():
        modified_remotes[_remote] = []
        for svc_id in roles_for_host:
            if svc_id.startswith("{0}.".format(target_role)):
                fqdn = str(_remote).split('@')[-1]
                nodename = str(str(_remote).split('.')[0]).split('@')[1]
                if target_role == 'mon':
                    nodes_of_interest.append(fqdn)
                else:
                    nodes_of_interest.append(nodename)

                modified_remotes[_remote].append(
                    "{0}.{1}".format(target_role, nodename))
            else:
                modified_remotes[_remote].append(svc_id)

    ctx.cluster.remotes = modified_remotes

    return nodes_of_interest
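

# A minimal sketch (illustrative only) of the renaming rule documented above,
# assuming a remote that stringifies as 'ubuntu@foo23.front.example.com'.
# _example_ceph_deploy_name() is a hypothetical helper, not used by the task.
def _example_ceph_deploy_name(remote_str, target_role, svc_id):
    nodename = remote_str.split('.')[0].split('@')[1]
    if svc_id.startswith("{0}.".format(target_role)):
        # e.g. svc_id 'mon.a' on host foo23 becomes 'mon.foo23'
        return "{0}.{1}".format(target_role, nodename)
    return svc_id

# _example_ceph_deploy_name('ubuntu@foo23.front.example.com', 'mon', 'mon.a') == 'mon.foo23'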


def get_dev_for_osd(ctx, config):
    """Get a list of all osd device names."""
    osd_devs = []
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        host = remote.name.split('@')[-1]
        shortname = host.split('.')[0]
        devs = teuthology.get_scratch_devices(remote)
        num_osd_per_host = list(
            teuthology.roles_of_type(
                roles_for_host, 'osd'))
        num_osds = len(num_osd_per_host)
        if config.get('separate_journal_disk') is not None:
            num_devs_reqd = 2 * num_osds
            assert num_devs_reqd <= len(
                devs), 'fewer data and journal disks than required ' + shortname
            for dindex in range(0, num_devs_reqd, 2):
                jd_index = dindex + 1
                dev_short = devs[dindex].split('/')[-1]
                jdev_short = devs[jd_index].split('/')[-1]
                osd_devs.append((shortname, dev_short, jdev_short))
        else:
            assert num_osds <= len(devs), 'fewer disks than osds ' + shortname
            for dev in devs[:num_osds]:
                dev_short = dev.split('/')[-1]
                osd_devs.append((shortname, dev_short))
    return osd_devs


def get_all_nodes(ctx, config):
    """Return a string of node names separated by blanks"""
    nodelist = []
    for t, k in ctx.config['targets'].iteritems():
        host = t.split('@')[-1]
        simple_host = host.split('.')[0]
        nodelist.append(simple_host)
    nodelist = " ".join(nodelist)
    return nodelist
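
# Illustrative only: with ctx.config['targets'] keyed by strings like
# 'ubuntu@foo23.front.example.com' and 'ubuntu@foo24.front.example.com'
# (assumed hostnames), get_all_nodes() returns 'foo23 foo24'.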


@contextlib.contextmanager
def build_ceph_cluster(ctx, config):
    """Build a ceph cluster"""

    # Expect to find ceph_admin on the first mon by ID, same place that the download task
    # puts it. Remember this here, because subsequently IDs will change from those in
    # the test config to those that ceph-deploy invents.
    (ceph_admin,) = ctx.cluster.only(
        teuthology.get_first_mon(ctx, config)).remotes.iterkeys()

    def execute_ceph_deploy(cmd):
        """Remotely execute a ceph_deploy command"""
        return ceph_admin.run(
            args=[
                'cd',
                '{tdir}/ceph-deploy'.format(tdir=testdir),
                run.Raw('&&'),
                run.Raw(cmd),
            ],
            check_status=False,
        ).exitstatus

    try:
        log.info('Building ceph cluster using ceph-deploy...')
        testdir = teuthology.get_testdir(ctx)
        ceph_branch = None
        if config.get('branch') is not None:
            cbranch = config.get('branch')
            for var, val in cbranch.iteritems():
                ceph_branch = '--{var}={val}'.format(var=var, val=val)
        all_nodes = get_all_nodes(ctx, config)
        mds_nodes = get_nodes_using_role(ctx, 'mds')
        mds_nodes = " ".join(mds_nodes)
        mon_node = get_nodes_using_role(ctx, 'mon')
        mon_nodes = " ".join(mon_node)
        mgr_nodes = get_nodes_using_role(ctx, 'mgr')
        mgr_nodes = " ".join(mgr_nodes)
        new_mon = './ceph-deploy new' + " " + mon_nodes
        mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes
        mon_hostname = mon_nodes.split(' ')[0]
        mon_hostname = str(mon_hostname)
        gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname
        deploy_mds = './ceph-deploy mds create' + " " + mds_nodes
        no_of_osds = 0

        if mon_nodes is None:
            raise RuntimeError("no monitor nodes in the config file")

        estatus_new = execute_ceph_deploy(new_mon)
        if estatus_new != 0:
            raise RuntimeError("ceph-deploy: new command failed")

        log.info('adding config inputs...')
        testdir = teuthology.get_testdir(ctx)
        conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir)

        if config.get('conf') is not None:
            confp = config.get('conf')
            for section, keys in confp.iteritems():
                lines = '[{section}]\n'.format(section=section)
                teuthology.append_lines_to_file(ceph_admin, conf_path, lines,
                                                sudo=True)
                for key, value in keys.iteritems():
                    log.info("[%s] %s = %s" % (section, key, value))
                    lines = '{key} = {value}\n'.format(key=key, value=value)
                    teuthology.append_lines_to_file(
                        ceph_admin, conf_path, lines, sudo=True)

        # install ceph
        dev_branch = ctx.config['branch']
        branch = '--dev={branch}'.format(branch=dev_branch)
        if ceph_branch:
            option = ceph_branch
        else:
            option = branch
        install_nodes = './ceph-deploy install ' + option + " " + all_nodes
        estatus_install = execute_ceph_deploy(install_nodes)
        if estatus_install != 0:
            raise RuntimeError("ceph-deploy: Failed to install ceph")
        # install ceph-test package too
        install_nodes2 = './ceph-deploy install --tests ' + option + \
                         " " + all_nodes
        estatus_install = execute_ceph_deploy(install_nodes2)
        if estatus_install != 0:
            raise RuntimeError("ceph-deploy: Failed to install ceph-test")

        mon_create_nodes = './ceph-deploy mon create-initial'
        # If the following fails, it is OK, it might just be that the monitors
        # are taking way more than a minute/monitor to form quorum, so let's
        # try the next block, which will wait up to 15 minutes to gatherkeys.
        execute_ceph_deploy(mon_create_nodes)
        execute_ceph_deploy(mgr_create)

        # create-keys is explicit now
        # http://tracker.ceph.com/issues/16036
        mons = ctx.cluster.only(teuthology.is_type('mon'))
        for remote in mons.remotes.iterkeys():
            remote.run(args=['sudo', 'ceph-create-keys', '--cluster', 'ceph',
                             '--id', remote.shortname])

        estatus_gather = execute_ceph_deploy(gather_keys)

        estatus_mds = execute_ceph_deploy(deploy_mds)
        if estatus_mds != 0:
            raise RuntimeError("ceph-deploy: Failed to deploy mds")

        if config.get('test_mon_destroy') is not None:
            for d in range(1, len(mon_node)):
                mon_destroy_nodes = './ceph-deploy mon destroy' + \
                    " " + mon_node[d]
                estatus_mon_d = execute_ceph_deploy(mon_destroy_nodes)
                if estatus_mon_d != 0:
                    raise RuntimeError("ceph-deploy: Failed to delete monitor")

        node_dev_list = get_dev_for_osd(ctx, config)
        for d in node_dev_list:
            node = d[0]
            for disk in d[1:]:
                zap = './ceph-deploy disk zap ' + node + ':' + disk
                estatus = execute_ceph_deploy(zap)
                if estatus != 0:
                    raise RuntimeError("ceph-deploy: Failed to zap osds")
            osd_create_cmd = './ceph-deploy osd create '
            # first check for filestore, default is bluestore with ceph-deploy
            if config.get('filestore') is not None:
                osd_create_cmd += '--filestore '
            else:
                osd_create_cmd += '--bluestore '
            if config.get('dmcrypt') is not None:
                osd_create_cmd += '--dmcrypt '
            osd_create_cmd += ":".join(d)
            estatus_osd = execute_ceph_deploy(osd_create_cmd)
            if estatus_osd == 0:
                log.info('successfully created osd')
                no_of_osds += 1
            else:
                raise RuntimeError("ceph-deploy: Failed to create osds")
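
        # For illustration (device names assumed, not taken from a real job):
        # a node_dev_list entry ('smithi001', 'vdb') makes the loop above run
        #   ./ceph-deploy osd create --bluestore smithi001:vdb
        # while a separate-journal entry ('smithi001', 'vdb', 'vdc') with dmcrypt runs
        #   ./ceph-deploy osd create --bluestore --dmcrypt smithi001:vdb:vdc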

        if config.get('wait-for-healthy', True) and no_of_osds >= 2:
            is_healthy(ctx=ctx, config=None)

            log.info('Setting up client nodes...')
            conf_path = '/etc/ceph/ceph.conf'
            admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring'
            first_mon = teuthology.get_first_mon(ctx, config)
            (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys()
            conf_data = teuthology.get_file(
                remote=mon0_remote,
                path=conf_path,
                sudo=True,
            )
            admin_keyring = teuthology.get_file(
                remote=mon0_remote,
                path=admin_keyring_path,
                sudo=True,
            )

            clients = ctx.cluster.only(teuthology.is_type('client'))
            for remot, roles_for_host in clients.remotes.iteritems():
                for id_ in teuthology.roles_of_type(roles_for_host, 'client'):
                    client_keyring = \
                        '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
                    mon0_remote.run(
                        args=[
                            'cd',
                            '{tdir}'.format(tdir=testdir),
                            run.Raw('&&'),
                            'sudo', 'bash', '-c',
                            run.Raw('"'), 'ceph',
                            'auth',
                            'get-or-create',
                            'client.{id}'.format(id=id_),
                            'mds', 'allow',
                            'mon', 'allow *',
                            'osd', 'allow *',
                            run.Raw('>'),
                            client_keyring,
                            run.Raw('"'),
                        ],
                    )
                    key_data = teuthology.get_file(
                        remote=mon0_remote,
                        path=client_keyring,
                        sudo=True,
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=client_keyring,
                        data=key_data,
                        perms='0644'
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=admin_keyring_path,
                        data=admin_keyring,
                        perms='0644'
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=conf_path,
                        data=conf_data,
                        perms='0644'
                    )

            if mds_nodes:
                log.info('Configuring CephFS...')
                ceph_fs = Filesystem(ctx, create=True)
        elif not config.get('only_mon'):
            raise RuntimeError(
                "The cluster is NOT operational due to insufficient OSDs")
        yield

    except Exception:
        log.info(
            "Error encountered, logging exception before tearing down ceph-deploy")
        log.info(traceback.format_exc())
        raise
    finally:
        if config.get('keep_running'):
            return
        log.info('Stopping ceph...')
        ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
                              'sudo', 'service', 'ceph', 'stop', run.Raw('||'),
                              'sudo', 'systemctl', 'stop', 'ceph.target'])

        # Are you really not running anymore?
        # Try first with the init tooling, ignoring the status so this
        # becomes informational only.
        ctx.cluster.run(
            args=[
                'sudo', 'status', 'ceph-all', run.Raw('||'),
                'sudo', 'service', 'ceph', 'status', run.Raw('||'),
                'sudo', 'systemctl', 'status', 'ceph.target'],
            check_status=False)

        # And now just check for the processes themselves, in case upstart/sysvinit
        # is lying to us. Ignore errors if the grep fails.
        ctx.cluster.run(args=['sudo', 'ps', 'aux', run.Raw('|'),
                              'grep', '-v', 'grep', run.Raw('|'),
                              'grep', 'ceph'], check_status=False)

        if ctx.archive is not None:
            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            os.makedirs(path)
            mons = ctx.cluster.only(teuthology.is_type('mon'))
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    if role.startswith('mon.'):
                        teuthology.pull_directory_tarball(
                            remote,
                            '/var/lib/ceph/mon',
                            path + '/' + role + '.tgz')

            log.info('Compressing logs...')

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            os.makedirs(path)
            for remote in ctx.cluster.remotes.iterkeys():
                sub = os.path.join(path, remote.shortname)
                os.makedirs(sub)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))

        # Prevent these from being undefined if the try block fails
        all_nodes = get_all_nodes(ctx, config)
        purge_nodes = './ceph-deploy purge' + " " + all_nodes
        purgedata_nodes = './ceph-deploy purgedata' + " " + all_nodes

        log.info('Purging package...')
        execute_ceph_deploy(purge_nodes)
        log.info('Purging data...')
        execute_ceph_deploy(purgedata_nodes)


@contextlib.contextmanager
def cli_test(ctx, config):
    """
    ceph-deploy cli tests: exercise the most commonly used cli commands,
    ensure that they all work, and start up the init system.
    """
    log.info('Ceph-deploy Test')
    if config is None:
        config = {}
    test_branch = ''
    conf_dir = teuthology.get_testdir(ctx) + "/cdtest"

    def execute_cdeploy(admin, cmd, path):
        """Execute ceph-deploy commands, using either the git checkout or the repo path."""
        args = ['cd', conf_dir, run.Raw(';')]
        if path:
            args.append('{path}/ceph-deploy/ceph-deploy'.format(path=path))
        else:
            args.append('ceph-deploy')
        args.append(run.Raw(cmd))
        ec = admin.run(args=args, check_status=False).exitstatus
        if ec != 0:
            raise RuntimeError(
                "failed during ceph-deploy cmd: {cmd} , ec={ec}".format(cmd=cmd, ec=ec))

    if config.get('rhbuild'):
        path = None
    else:
        path = teuthology.get_testdir(ctx)
        # test on a branch from the config, e.g. wip-*, master or next;
        # packages for all distros should exist for wip*
        if ctx.config.get('branch'):
            branch = ctx.config.get('branch')
            test_branch = ' --dev={branch} '.format(branch=branch)
    mons = ctx.cluster.only(teuthology.is_type('mon'))
    for node, role in mons.remotes.iteritems():
        admin = node
        admin.run(args=['mkdir', conf_dir], check_status=False)
        nodename = admin.shortname
    system_type = teuthology.get_system_type(admin)
    if config.get('rhbuild'):
        admin.run(args=['sudo', 'yum', 'install', 'ceph-deploy', '-y'])
    log.info('system type is %s', system_type)
    osds = ctx.cluster.only(teuthology.is_type('osd'))

    for remote, roles in osds.remotes.iteritems():
        devs = teuthology.get_scratch_devices(remote)
        log.info("roles %s", roles)
        if len(devs) < 3:
            log.error(
                'Test needs minimum of 3 devices, only found %s',
                str(devs))
            raise RuntimeError("Needs minimum of 3 devices ")

        conf_path = '{conf_dir}/ceph.conf'.format(conf_dir=conf_dir)
        new_cmd = 'new ' + nodename
        execute_cdeploy(admin, new_cmd, path)
        if config.get('conf') is not None:
            confp = config.get('conf')
            for section, keys in confp.iteritems():
                lines = '[{section}]\n'.format(section=section)
                teuthology.append_lines_to_file(admin, conf_path, lines,
                                                sudo=True)
                for key, value in keys.iteritems():
                    log.info("[%s] %s = %s" % (section, key, value))
                    lines = '{key} = {value}\n'.format(key=key, value=value)
                    teuthology.append_lines_to_file(admin, conf_path, lines,
                                                    sudo=True)
        new_mon_install = 'install {branch} --mon '.format(
            branch=test_branch) + nodename
        new_mgr_install = 'install {branch} --mgr '.format(
            branch=test_branch) + nodename
        new_osd_install = 'install {branch} --osd '.format(
            branch=test_branch) + nodename
        new_admin = 'install {branch} --cli '.format(branch=test_branch) + nodename
        create_initial = 'mon create-initial '
        # either use create-keys or push command
        push_keys = 'admin ' + nodename
        execute_cdeploy(admin, new_mon_install, path)
        execute_cdeploy(admin, new_mgr_install, path)
        execute_cdeploy(admin, new_osd_install, path)
        execute_cdeploy(admin, new_admin, path)
        execute_cdeploy(admin, create_initial, path)
        execute_cdeploy(admin, push_keys, path)

        for i in range(3):
            zap_disk = 'disk zap ' + "{n}:{d}".format(n=nodename, d=devs[i])
            prepare = 'osd prepare ' + "{n}:{d}".format(n=nodename, d=devs[i])
            execute_cdeploy(admin, zap_disk, path)
            execute_cdeploy(admin, prepare, path)

        log.info("list files for debugging purposes, to check file permissions")
        admin.run(args=['ls', run.Raw('-lt'), conf_dir])
        remote.run(args=['sudo', 'ceph', '-s'], check_status=False)
        r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO())
        out = r.stdout.getvalue()
        log.info('Ceph health: %s', out.rstrip('\n'))
        log.info("Waiting for cluster to become healthy")
        with contextutil.safe_while(sleep=10, tries=6,
                                    action='check health') as proceed:
            while proceed():
                r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO())
                out = r.stdout.getvalue()
                if out.split(None, 1)[0] == 'HEALTH_OK':
                    break
        rgw_install = 'install {branch} --rgw {node}'.format(
            branch=test_branch,
            node=nodename,
        )
        rgw_create = 'rgw create ' + nodename
        execute_cdeploy(admin, rgw_install, path)
        execute_cdeploy(admin, rgw_create, path)
        log.info('All ceph-deploy cli tests passed')
        try:
            yield
        finally:
            log.info("cleaning up")
            ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
                                  'sudo', 'service', 'ceph', 'stop', run.Raw('||'),
                                  'sudo', 'systemctl', 'stop', 'ceph.target'],
                            check_status=False)
            time.sleep(4)
            for i in range(3):
                umount_dev = "{d}1".format(d=devs[i])
                r = remote.run(args=['sudo', 'umount', run.Raw(umount_dev)])
            cmd = 'purge ' + nodename
            execute_cdeploy(admin, cmd, path)
            cmd = 'purgedata ' + nodename
            execute_cdeploy(admin, cmd, path)
            log.info("Removing temporary dir")
            admin.run(
                args=['rm', run.Raw('-rf'), run.Raw(conf_dir)],
                check_status=False)
            if config.get('rhbuild'):
                admin.run(args=['sudo', 'yum', 'remove', 'ceph-deploy', '-y'])


@contextlib.contextmanager
def single_node_test(ctx, config):
    """
    Run the ceph-deploy cli test on a single node, for example:

    - ceph-deploy.single_node_test: null

    - ceph-deploy.single_node_test:
    """
    log.info("Testing ceph-deploy on a single node")
    if config is None:
        config = {}
    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('ceph-deploy', {}))

    if config.get('rhbuild'):
        log.info("RH Build, Skip Download")
        with contextutil.nested(
                lambda: cli_test(ctx=ctx, config=config),
        ):
            yield
    else:
        with contextutil.nested(
                lambda: install_fn.ship_utilities(ctx=ctx, config=None),
                lambda: download_ceph_deploy(ctx=ctx, config=config),
                lambda: cli_test(ctx=ctx, config=config),
        ):
            yield


@contextlib.contextmanager
def task(ctx, config):
    """
    Set up and tear down a Ceph cluster.

    Example configuration fragments:

        mon_initial_members: 1
        # either choose bluestore or filestore, default is bluestore
        separate_journal_disk: yes
    """
    if config is None:
        config = {}

    assert isinstance(config, dict), \
        "task ceph-deploy only supports a dictionary for configuration"

    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('ceph-deploy', {}))

    if config.get('branch') is not None:
        assert isinstance(
            config['branch'], dict), 'branch must be a dictionary'

    log.info('task ceph-deploy with config ' + str(config))

    with contextutil.nested(
            lambda: install_fn.ship_utilities(ctx=ctx, config=None),
            lambda: download_ceph_deploy(ctx=ctx, config=config),
            lambda: build_ceph_cluster(ctx=ctx, config=config),
    ):
        yield