[ceph.git] / ceph / qa / tasks / osd_failsafe_enospc.py

"""
Test handling of the OSD failsafe configuration settings (nearfull ratio and full ratio)
"""
from io import BytesIO
import logging
import six
import time

from teuthology.orchestra import run
from tasks.util.rados import rados
from teuthology import misc as teuthology

# Module-level logger named after this module; task() uses it for progress messages.
log = logging.getLogger(__name__)

def _count_log_lines(lines, needle):
    """
    Count the entries of ``lines`` that contain the substring ``needle``.

    A generator is summed rather than calling ``len(filter(...))`` because
    on Python 3 ``filter`` returns an iterator, which has no ``len()``.

    :param lines: iterable of strings (captured cluster-log lines)
    :param needle: substring to look for in each line
    :returns: number of lines containing ``needle``
    """
    return sum(1 for line in lines if needle in line)


def _watch_cluster_log(mon, duration, trigger=None):
    """
    Capture the output of ``ceph -w`` on ``mon`` for ``duration`` seconds.

    The watcher is started in the background first, then ``trigger`` (if
    given) is invoked so that whatever it causes is logged while the watcher
    is already listening.

    :param mon: remote on which ``ceph -w`` is run
    :param duration: seconds to keep watching after ``trigger`` has run
    :param trigger: optional zero-argument callable run once the watcher
                    has been started
    :returns: the captured stdout as a list of lines (split on newlines)
    """
    proc = mon.run(
        args=[
            'sudo',
            'daemon-helper',
            'kill',
            'ceph', '-w'
        ],
        stdin=run.PIPE,
        stdout=BytesIO(),
        wait=False,
    )
    try:
        if trigger is not None:
            trigger()
        time.sleep(duration)
    finally:
        # Always stop the watcher, even if the trigger raised, so that no
        # background ``ceph -w`` is left behind on the remote.
        proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
        proc.wait()

    return six.ensure_str(proc.stdout.getvalue()).split('\n')


def task(ctx, config):
    """
    Test handling of osd_failsafe_nearfull_ratio and osd_failsafe_full_ratio
    configuration settings

    In order for test to pass must use log-ignorelist as follows

        tasks:
            - chef:
            - install:
            - ceph:
                log-ignorelist: ['OSD near full', 'OSD full dropping all updates']
            - osd_failsafe_enospc:

    The test marks every OSD except osd.0 out, creates pool "foo" with
    size 1, and then moves osd.0 through the NONE / NEAR / FULL failsafe
    states by injecting tiny ratios, counting the resulting warning and
    error lines in ``ceph -w`` output and checking that writes fail only
    while the full ratio is exceeded.

    :param ctx: teuthology context; ``ctx.managers['ceph']`` and
                ``ctx.cluster`` are used
    :param config: task configuration dict, or None (treated as ``{}``)
    :raises AssertionError: if ``config`` is not a dict or any of the
                            numbered checks fails
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'osd_failsafe_enospc task only accepts a dict for configuration'

    # Give 2 seconds for injectargs + osd_op_complaint_time (30) + 2 * osd_heartbeat_interval (6) + 6 padding
    sleep_time = 50

    # something that is always there
    dummyfile = '/etc/fstab'
    dummyfile2 = '/etc/resolv.conf'

    # Cluster-log fragments that are counted in the captured output.
    near_full_msg = '[WRN] OSD near full'
    full_msg = '[ERR] OSD full dropping all updates'

    manager = ctx.managers['ceph']

    def inject_osd0(option):
        # Change a single config option on the running osd.0.
        manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', option)

    # create 1 pg pool with 1 rep which can only be on osd.0
    osds = manager.get_osd_dump()
    for osd in osds:
        if osd['osd'] != 0:
            manager.mark_out_osd(osd['osd'])

    log.info('creating pool foo')
    manager.create_pool("foo")
    manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'size', '1')

    # State NONE -> NEAR
    log.info('1. Verify warning messages when exceeding nearfull_ratio')

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    lines = _watch_cluster_log(
        mon, sleep_time,
        lambda: inject_osd0('--osd_failsafe_nearfull_ratio .00001'))

    count = _count_log_lines(lines, near_full_msg)
    assert count == 2, 'Incorrect number of warning messages expected 2 got %d' % count
    count = _count_log_lines(lines, full_msg)
    assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count

    # State NEAR -> FULL
    log.info('2. Verify error messages when exceeding full_ratio')

    lines = _watch_cluster_log(
        mon, sleep_time,
        lambda: inject_osd0('--osd_failsafe_full_ratio .00001'))

    count = _count_log_lines(lines, full_msg)
    assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count

    log.info('3. Verify write failure when exceeding full_ratio')

    # Write data should fail
    ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile1', dummyfile])
    assert ret != 0, 'Expected write failure but it succeeded with exit status 0'

    # Put back default
    inject_osd0('--osd_failsafe_full_ratio .97')
    time.sleep(10)

    # State FULL -> NEAR
    log.info('4. Verify write success when NOT exceeding full_ratio')

    # Write should succeed
    ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile2', dummyfile2])
    assert ret == 0, 'Expected write to succeed, but got exit status %d' % ret

    log.info('5. Verify warning messages again when exceeding nearfull_ratio')

    # Nothing is injected here: the nearfull ratio set in step 1 is still in effect.
    lines = _watch_cluster_log(mon, sleep_time)

    count = _count_log_lines(lines, near_full_msg)
    assert count in (1, 2), 'Incorrect number of warning messages expected 1 or 2 got %d' % count
    count = _count_log_lines(lines, full_msg)
    assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count

    inject_osd0('--osd_failsafe_nearfull_ratio .90')
    time.sleep(10)

    # State NONE -> FULL
    log.info('6. Verify error messages again when exceeding full_ratio')

    lines = _watch_cluster_log(
        mon, sleep_time,
        lambda: inject_osd0('--osd_failsafe_full_ratio .00001'))

    count = _count_log_lines(lines, near_full_msg)
    assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
    count = _count_log_lines(lines, full_msg)
    assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count

    # State FULL -> NONE
    log.info('7. Verify no messages settings back to default')

    inject_osd0('--osd_failsafe_full_ratio .97')
    time.sleep(10)

    lines = _watch_cluster_log(mon, sleep_time)

    count = _count_log_lines(lines, near_full_msg)
    assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
    count = _count_log_lines(lines, full_msg)
    assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count

    log.info('Test Passed')

    # Bring all OSDs back in
    # NOTE(review): as before, this cleanup only runs when every check above passed.
    manager.remove_pool("foo")
    for osd in osds:
        if osd['osd'] != 0:
            manager.mark_in_osd(osd['osd'])
Commit	Line	Data
7c673cae FG	1	"""
	2	Handle osdfailsafe configuration settings (nearfull ratio and full ratio)
	3	"""
9f95a23c	4	from io import BytesIO
7c673cae	5	import logging
9f95a23c	6	import six
7c673cae FG	7	import time
	8
	9	from teuthology.orchestra import run
e306af50	10	from tasks.util.rados import rados
7c673cae FG	11	from teuthology import misc as teuthology
	12
	13	log = logging.getLogger(__name__)
	14
	15	def task(ctx, config):
	16	"""
	17	Test handling of osd_failsafe_nearfull_ratio and osd_failsafe_full_ratio
	18	configuration settings
	19
cd265ab1	20	In order for test to pass must use log-ignorelist as follows
7c673cae FG	21
	22	tasks:
	23	- chef:
	24	- install:
	25	- ceph:
cd265ab1	26	log-ignorelist: ['OSD near full', 'OSD full dropping all updates']
7c673cae FG	27	- osd_failsafe_enospc:
	28
	29	"""
	30	if config is None:
	31	config = {}
	32	assert isinstance(config, dict), \
	33	'osd_failsafe_enospc task only accepts a dict for configuration'
	34
	35	# Give 2 seconds for injectargs + osd_op_complaint_time (30) + 2 * osd_heartbeat_interval (6) + 6 padding
	36	sleep_time = 50
	37
	38	# something that is always there
	39	dummyfile = '/etc/fstab'
	40	dummyfile2 = '/etc/resolv.conf'
	41
	42	manager = ctx.managers['ceph']
	43
	44	# create 1 pg pool with 1 rep which can only be on osd.0
	45	osds = manager.get_osd_dump()
	46	for osd in osds:
	47	if osd['osd'] != 0:
	48	manager.mark_out_osd(osd['osd'])
	49
	50	log.info('creating pool foo')
	51	manager.create_pool("foo")
	52	manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'size', '1')
	53
	54	# State NONE -> NEAR
	55	log.info('1. Verify warning messages when exceeding nearfull_ratio')
	56
	57	first_mon = teuthology.get_first_mon(ctx, config)
9f95a23c	58	(mon,) = ctx.cluster.only(first_mon).remotes.keys()
7c673cae FG	59
	60	proc = mon.run(
	61	args=[
	62	'sudo',
	63	'daemon-helper',
	64	'kill',
	65	'ceph', '-w'
	66	],
	67	stdin=run.PIPE,
9f95a23c	68	stdout=BytesIO(),
7c673cae FG	69	wait=False,
	70	)
	71
	72	manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .00001')
	73
	74	time.sleep(sleep_time)
	75	proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
	76	proc.wait()
	77
9f95a23c	78	lines = six.ensure_str(proc.stdout.getvalue()).split('\n')
7c673cae FG	79
	80	count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
	81	assert count == 2, 'Incorrect number of warning messages expected 2 got %d' % count
	82	count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
	83	assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count
	84
	85	# State NEAR -> FULL
	86	log.info('2. Verify error messages when exceeding full_ratio')
	87
	88	proc = mon.run(
	89	args=[
	90	'sudo',
	91	'daemon-helper',
	92	'kill',
	93	'ceph', '-w'
	94	],
	95	stdin=run.PIPE,
9f95a23c	96	stdout=BytesIO(),
7c673cae FG	97	wait=False,
	98	)
	99
	100	manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001')
	101
	102	time.sleep(sleep_time)
	103	proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
	104	proc.wait()
	105
9f95a23c	106	lines = six.ensure_str(proc.stdout.getvalue()).split('\n')
7c673cae FG	107
	108	count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
	109	assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count
	110
	111	log.info('3. Verify write failure when exceeding full_ratio')
	112
	113	# Write data should fail
	114	ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile1', dummyfile])
	115	assert ret != 0, 'Expected write failure but it succeeded with exit status 0'
	116
	117	# Put back default
	118	manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')
	119	time.sleep(10)
	120
	121	# State FULL -> NEAR
	122	log.info('4. Verify write success when NOT exceeding full_ratio')
	123
	124	# Write should succeed
	125	ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile2', dummyfile2])
	126	assert ret == 0, 'Expected write to succeed, but got exit status %d' % ret
	127
	128	log.info('5. Verify warning messages again when exceeding nearfull_ratio')
	129
	130	proc = mon.run(
	131	args=[
	132	'sudo',
	133	'daemon-helper',
	134	'kill',
	135	'ceph', '-w'
	136	],
	137	stdin=run.PIPE,
9f95a23c	138	stdout=BytesIO(),
7c673cae FG	139	wait=False,
	140	)
	141
	142	time.sleep(sleep_time)
	143	proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
	144	proc.wait()
	145
9f95a23c	146	lines = six.ensure_str(proc.stdout.getvalue()).split('\n')
7c673cae FG	147
	148	count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
	149	assert count == 1 or count == 2, 'Incorrect number of warning messages expected 1 or 2 got %d' % count
	150	count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
	151	assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count
	152
	153	manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .90')
	154	time.sleep(10)
	155
	156	# State NONE -> FULL
	157	log.info('6. Verify error messages again when exceeding full_ratio')
	158
	159	proc = mon.run(
	160	args=[
	161	'sudo',
	162	'daemon-helper',
	163	'kill',
	164	'ceph', '-w'
	165	],
	166	stdin=run.PIPE,
9f95a23c	167	stdout=BytesIO(),
7c673cae FG	168	wait=False,
	169	)
	170
	171	manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001')
	172
	173	time.sleep(sleep_time)
	174	proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
	175	proc.wait()
	176
9f95a23c	177	lines = six.ensure_str(proc.stdout.getvalue()).split('\n')
7c673cae FG	178
	179	count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
	180	assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
	181	count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
	182	assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count
	183
	184	# State FULL -> NONE
	185	log.info('7. Verify no messages settings back to default')
	186
	187	manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')
	188	time.sleep(10)
	189
	190	proc = mon.run(
	191	args=[
	192	'sudo',
	193	'daemon-helper',
	194	'kill',
	195	'ceph', '-w'
	196	],
	197	stdin=run.PIPE,
9f95a23c	198	stdout=BytesIO(),
7c673cae FG	199	wait=False,
	200	)
	201
	202	time.sleep(sleep_time)
	203	proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
	204	proc.wait()
	205
9f95a23c	206	lines = six.ensure_str(proc.stdout.getvalue()).split('\n')
7c673cae FG	207
	208	count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
	209	assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
	210	count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
	211	assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count
	212
	213	log.info('Test Passed')
	214
	215	# Bring all OSDs back in
	216	manager.remove_pool("foo")
	217	for osd in osds:
	218	if osd['osd'] != 0:
	219	manager.mark_in_osd(osd['osd'])