[ceph.git] / ceph / qa / tasks / osd_failsafe_enospc.py

"""
Test handling of the OSD failsafe configuration settings (nearfull ratio and full ratio)
"""
from io import StringIO
import logging
import time

from teuthology.orchestra import run
from tasks.util.rados import rados
from teuthology import misc as teuthology

log = logging.getLogger(__name__)

def _count_matching(lines, needle):
    """
    Return how many entries of *lines* contain the substring *needle*.

    A generator-based count is used instead of ``len(filter(...))`` because
    ``filter()`` returns an iterator on Python 3 and has no ``len()``.
    """
    return sum(1 for line in lines if needle in line)


def _watch_cluster_log(mon, duration, trigger=None):
    """
    Capture the output of ``ceph -w`` on a remote for a fixed period.

    The watcher is started first, then *trigger* (if any) is invoked, so
    that messages caused by the trigger are seen by the watcher.

    :param mon: remote on which ``ceph -w`` is run via its ``run()`` method
    :param duration: seconds to sleep before stopping the watcher
    :param trigger: optional zero-argument callable invoked after the
                    watcher has been started and before sleeping
    :returns: the captured stdout split on newlines
    """
    proc = mon.run(
        args=[
            'sudo',
            'daemon-helper',
            'kill',
            'ceph', '-w'
        ],
        stdin=run.PIPE,
        stdout=StringIO(),
        wait=False,
    )

    if trigger is not None:
        trigger()

    time.sleep(duration)
    proc.stdin.close()  # causes daemon-helper send SIGKILL to ceph -w
    proc.wait()

    return proc.stdout.getvalue().split('\n')


def task(ctx, config):
    """
    Test handling of osd_failsafe_nearfull_ratio and osd_failsafe_full_ratio
    configuration settings

    In order for test to pass must use log-ignorelist as follows

        tasks:
            - chef:
            - install:
            - ceph:
                log-ignorelist: ['OSD near full', 'OSD full dropping all updates']
            - osd_failsafe_enospc:

    The test marks every OSD except osd.0 out, creates a size-1 pool "foo",
    and then walks osd.0 through the failsafe states by injecting tiny
    ratios, checking the messages shown by ``ceph -w`` and the result of
    rados writes at each step. On success the pool is removed and the OSDs
    are marked back in.

    :param ctx: teuthology context; ``ctx.managers['ceph']`` and
                ``ctx.cluster`` are used
    :param config: task configuration, a dict or None
    :raises AssertionError: if config is not a dict, or if any of the
                            message-count / write-status checks fails
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'osd_failsafe_enospc task only accepts a dict for configuration'

    # Cluster log fragments that are counted in the ceph -w output
    nearfull_msg = '[WRN] OSD near full'
    full_msg = '[ERR] OSD full dropping all updates'

    # Give 2 seconds for injectargs + osd_op_complaint_time (30) + 2 * osd_heartbeat_interval (6) + 6 padding
    sleep_time = 50

    # something that is always there
    dummyfile = '/etc/fstab'
    dummyfile2 = '/etc/resolv.conf'

    manager = ctx.managers['ceph']

    def inject_osd0(option):
        # Change a config option on the running osd.0
        manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', option)

    # create 1 pg pool with 1 rep which can only be on osd.0
    osds = manager.get_osd_dump()
    for osd in osds:
        if osd['osd'] != 0:
            manager.mark_out_osd(osd['osd'])

    log.info('creating pool foo')
    manager.create_pool("foo")
    manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'size', '1')

    # State NONE -> NEAR
    log.info('1. Verify warning messages when exceeding nearfull_ratio')

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    lines = _watch_cluster_log(
        mon, sleep_time,
        lambda: inject_osd0('--osd_failsafe_nearfull_ratio .00001'))

    count = _count_matching(lines, nearfull_msg)
    assert count == 2, 'Incorrect number of warning messages expected 2 got %d' % count
    count = _count_matching(lines, full_msg)
    assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count

    # State NEAR -> FULL
    log.info('2. Verify error messages when exceeding full_ratio')

    lines = _watch_cluster_log(
        mon, sleep_time,
        lambda: inject_osd0('--osd_failsafe_full_ratio .00001'))

    count = _count_matching(lines, full_msg)
    assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count

    log.info('3. Verify write failure when exceeding full_ratio')

    # Write data should fail
    ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile1', dummyfile])
    assert ret != 0, 'Expected write failure but it succeeded with exit status 0'

    # Put back default
    inject_osd0('--osd_failsafe_full_ratio .97')
    time.sleep(10)

    # State FULL -> NEAR
    log.info('4. Verify write success when NOT exceeding full_ratio')

    # Write should succeed
    ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile2', dummyfile2])
    assert ret == 0, 'Expected write to succeed, but got exit status %d' % ret

    log.info('5. Verify warning messages again when exceeding nearfull_ratio')

    # Nothing is injected here: the nearfull ratio set in step 1 is still
    # in effect, so only the watcher is run.
    lines = _watch_cluster_log(mon, sleep_time)

    count = _count_matching(lines, nearfull_msg)
    assert count in (1, 2), 'Incorrect number of warning messages expected 1 or 2 got %d' % count
    count = _count_matching(lines, full_msg)
    assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count

    inject_osd0('--osd_failsafe_nearfull_ratio .90')
    time.sleep(10)

    # State NONE -> FULL
    log.info('6. Verify error messages again when exceeding full_ratio')

    lines = _watch_cluster_log(
        mon, sleep_time,
        lambda: inject_osd0('--osd_failsafe_full_ratio .00001'))

    count = _count_matching(lines, nearfull_msg)
    assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
    count = _count_matching(lines, full_msg)
    assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count

    # State FULL -> NONE
    log.info('7. Verify no messages settings back to default')

    inject_osd0('--osd_failsafe_full_ratio .97')
    time.sleep(10)

    lines = _watch_cluster_log(mon, sleep_time)

    count = _count_matching(lines, nearfull_msg)
    assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
    count = _count_matching(lines, full_msg)
    assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count

    log.info('Test Passed')

    # Bring all OSDs back in
    manager.remove_pool("foo")
    for osd in osds:
        if osd['osd'] != 0:
            manager.mark_in_osd(osd['osd'])
Commit	Line	Data
7c673cae FG	1	"""
	2	Handle osdfailsafe configuration settings (nearfull ratio and full ratio)
	3	"""
f67539c2	4	from io import StringIO
7c673cae FG	5	import logging
	6	import time
	7
	8	from teuthology.orchestra import run
e306af50	9	from tasks.util.rados import rados
7c673cae FG	10	from teuthology import misc as teuthology
	11
	12	log = logging.getLogger(__name__)
	13
	14	def task(ctx, config):
	15	"""
	16	Test handling of osd_failsafe_nearfull_ratio and osd_failsafe_full_ratio
	17	configuration settings
	18
cd265ab1	19	In order for test to pass must use log-ignorelist as follows
7c673cae FG	20
	21	tasks:
	22	- chef:
	23	- install:
	24	- ceph:
cd265ab1	25	log-ignorelist: ['OSD near full', 'OSD full dropping all updates']
7c673cae FG	26	- osd_failsafe_enospc:
	27
	28	"""
	29	if config is None:
	30	config = {}
	31	assert isinstance(config, dict), \
	32	'osd_failsafe_enospc task only accepts a dict for configuration'
	33
	34	# Give 2 seconds for injectargs + osd_op_complaint_time (30) + 2 * osd_heartbeat_interval (6) + 6 padding
	35	sleep_time = 50
	36
	37	# something that is always there
	38	dummyfile = '/etc/fstab'
	39	dummyfile2 = '/etc/resolv.conf'
	40
	41	manager = ctx.managers['ceph']
	42
	43	# create 1 pg pool with 1 rep which can only be on osd.0
	44	osds = manager.get_osd_dump()
	45	for osd in osds:
	46	if osd['osd'] != 0:
	47	manager.mark_out_osd(osd['osd'])
	48
	49	log.info('creating pool foo')
	50	manager.create_pool("foo")
	51	manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'size', '1')
	52
	53	# State NONE -> NEAR
	54	log.info('1. Verify warning messages when exceeding nearfull_ratio')
	55
	56	first_mon = teuthology.get_first_mon(ctx, config)
9f95a23c	57	(mon,) = ctx.cluster.only(first_mon).remotes.keys()
7c673cae FG	58
	59	proc = mon.run(
	60	args=[
	61	'sudo',
	62	'daemon-helper',
	63	'kill',
	64	'ceph', '-w'
	65	],
	66	stdin=run.PIPE,
f67539c2	67	stdout=StringIO(),
7c673cae FG	68	wait=False,
	69	)
	70
	71	manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .00001')
	72
	73	time.sleep(sleep_time)
	74	proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
	75	proc.wait()
	76
f67539c2	77	lines = proc.stdout.getvalue().split('\n')
7c673cae FG	78
	79	count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
	80	assert count == 2, 'Incorrect number of warning messages expected 2 got %d' % count
	81	count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
	82	assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count
	83
	84	# State NEAR -> FULL
	85	log.info('2. Verify error messages when exceeding full_ratio')
	86
	87	proc = mon.run(
	88	args=[
	89	'sudo',
	90	'daemon-helper',
	91	'kill',
	92	'ceph', '-w'
	93	],
	94	stdin=run.PIPE,
f67539c2	95	stdout=StringIO(),
7c673cae FG	96	wait=False,
	97	)
	98
	99	manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001')
	100
	101	time.sleep(sleep_time)
	102	proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
	103	proc.wait()
	104
f67539c2	105	lines = proc.stdout.getvalue().split('\n')
7c673cae FG	106
	107	count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
	108	assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count
	109
	110	log.info('3. Verify write failure when exceeding full_ratio')
	111
	112	# Write data should fail
	113	ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile1', dummyfile])
	114	assert ret != 0, 'Expected write failure but it succeeded with exit status 0'
	115
	116	# Put back default
	117	manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')
	118	time.sleep(10)
	119
	120	# State FULL -> NEAR
	121	log.info('4. Verify write success when NOT exceeding full_ratio')
	122
	123	# Write should succeed
	124	ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile2', dummyfile2])
	125	assert ret == 0, 'Expected write to succeed, but got exit status %d' % ret
	126
	127	log.info('5. Verify warning messages again when exceeding nearfull_ratio')
	128
	129	proc = mon.run(
	130	args=[
	131	'sudo',
	132	'daemon-helper',
	133	'kill',
	134	'ceph', '-w'
	135	],
	136	stdin=run.PIPE,
f67539c2	137	stdout=StringIO(),
7c673cae FG	138	wait=False,
	139	)
	140
	141	time.sleep(sleep_time)
	142	proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
	143	proc.wait()
	144
f67539c2	145	lines = proc.stdout.getvalue().split('\n')
7c673cae FG	146
	147	count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
	148	assert count == 1 or count == 2, 'Incorrect number of warning messages expected 1 or 2 got %d' % count
	149	count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
	150	assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count
	151
	152	manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .90')
	153	time.sleep(10)
	154
	155	# State NONE -> FULL
	156	log.info('6. Verify error messages again when exceeding full_ratio')
	157
	158	proc = mon.run(
	159	args=[
	160	'sudo',
	161	'daemon-helper',
	162	'kill',
	163	'ceph', '-w'
	164	],
	165	stdin=run.PIPE,
f67539c2	166	stdout=StringIO(),
7c673cae FG	167	wait=False,
	168	)
	169
	170	manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001')
	171
	172	time.sleep(sleep_time)
	173	proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
	174	proc.wait()
	175
f67539c2	176	lines = proc.stdout.getvalue().split('\n')
7c673cae FG	177
	178	count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
	179	assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
	180	count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
	181	assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count
	182
	183	# State FULL -> NONE
	184	log.info('7. Verify no messages settings back to default')
	185
	186	manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')
	187	time.sleep(10)
	188
	189	proc = mon.run(
	190	args=[
	191	'sudo',
	192	'daemon-helper',
	193	'kill',
	194	'ceph', '-w'
	195	],
	196	stdin=run.PIPE,
f67539c2	197	stdout=StringIO(),
7c673cae FG	198	wait=False,
	199	)
	200
	201	time.sleep(sleep_time)
	202	proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
	203	proc.wait()
	204
f67539c2	205	lines = proc.stdout.getvalue().split('\n')
7c673cae FG	206
	207	count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
	208	assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
	209	count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
	210	assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count
	211
	212	log.info('Test Passed')
	213
	214	# Bring all OSDs back in
	215	manager.remove_pool("foo")
	216	for osd in osds:
	217	if osd['osd'] != 0:
	218	manager.mark_in_osd(osd['osd'])