ceph/qa/tasks/osd_failsafe_enospc.py

   1 """
   2 Handle osdfailsafe configuration settings (nearfull ratio and full ratio)
   3 """
   4 from io import BytesIO
   5 import logging
   6 import six
   7 import time
   8
   9 from teuthology.orchestra import run
  10 from util.rados import rados
  11 from teuthology import misc as teuthology
  12
  13 log = logging.getLogger(__name__)
  14
  15 def task(ctx, config):
  16     """
  17     Test handling of osd_failsafe_nearfull_ratio and osd_failsafe_full_ratio
  18     configuration settings
  19
  20     In order for test to pass must use log-whitelist as follows
  21
  22         tasks:
  23             - chef:
  24             - install:
  25             - ceph:
  26                 log-whitelist: ['OSD near full', 'OSD full dropping all updates']
  27             - osd_failsafe_enospc:
  28
  29     """
  30     if config is None:
  31         config = {}
  32     assert isinstance(config, dict), \
  33         'osd_failsafe_enospc task only accepts a dict for configuration'
  34
  35     # Give 2 seconds for injectargs + osd_op_complaint_time (30) + 2 * osd_heartbeat_interval (6) + 6 padding
  36     sleep_time = 50
  37
  38     # something that is always there
  39     dummyfile = '/etc/fstab'
  40     dummyfile2 = '/etc/resolv.conf'
  41
  42     manager = ctx.managers['ceph']
  43
  44     # create 1 pg pool with 1 rep which can only be on osd.0
  45     osds = manager.get_osd_dump()
  46     for osd in osds:
  47         if osd['osd'] != 0:
  48             manager.mark_out_osd(osd['osd'])
  49
  50     log.info('creating pool foo')
  51     manager.create_pool("foo")
  52     manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'size', '1')
  53
  54     # State NONE -> NEAR
  55     log.info('1. Verify warning messages when exceeding nearfull_ratio')
  56
  57     first_mon = teuthology.get_first_mon(ctx, config)
  58     (mon,) = ctx.cluster.only(first_mon).remotes.keys()
  59
  60     proc = mon.run(
  61              args=[
  62                  'sudo',
  63                  'daemon-helper',
  64                  'kill',
  65                  'ceph', '-w'
  66              ],
  67              stdin=run.PIPE,
  68              stdout=BytesIO(),
  69              wait=False,
  70         )
  71
  72     manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .00001')
  73
  74     time.sleep(sleep_time)
  75     proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
  76     proc.wait()
  77
  78     lines = six.ensure_str(proc.stdout.getvalue()).split('\n')
  79
  80     count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
  81     assert count == 2, 'Incorrect number of warning messages expected 2 got %d' % count
  82     count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
  83     assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count
  84
  85     # State NEAR -> FULL
  86     log.info('2. Verify error messages when exceeding full_ratio')
  87
  88     proc = mon.run(
  89              args=[
  90                  'sudo',
  91                  'daemon-helper',
  92                  'kill',
  93                  'ceph', '-w'
  94              ],
  95              stdin=run.PIPE,
  96              stdout=BytesIO(),
  97              wait=False,
  98         )
  99
 100     manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001')
 101
 102     time.sleep(sleep_time)
 103     proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
 104     proc.wait()
 105
 106     lines = six.ensure_str(proc.stdout.getvalue()).split('\n')
 107
 108     count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
 109     assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count
 110
 111     log.info('3. Verify write failure when exceeding full_ratio')
 112
 113     # Write data should fail
 114     ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile1', dummyfile])
 115     assert ret != 0, 'Expected write failure but it succeeded with exit status 0'
 116
 117     # Put back default
 118     manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')
 119     time.sleep(10)
 120
 121     # State FULL -> NEAR
 122     log.info('4. Verify write success when NOT exceeding full_ratio')
 123
 124     # Write should succeed
 125     ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile2', dummyfile2])
 126     assert ret == 0, 'Expected write to succeed, but got exit status %d' % ret
 127
 128     log.info('5. Verify warning messages again when exceeding nearfull_ratio')
 129
 130     proc = mon.run(
 131              args=[
 132                  'sudo',
 133                  'daemon-helper',
 134                  'kill',
 135                  'ceph', '-w'
 136              ],
 137              stdin=run.PIPE,
 138              stdout=BytesIO(),
 139              wait=False,
 140         )
 141
 142     time.sleep(sleep_time)
 143     proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
 144     proc.wait()
 145
 146     lines = six.ensure_str(proc.stdout.getvalue()).split('\n')
 147
 148     count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
 149     assert count == 1 or count == 2, 'Incorrect number of warning messages expected 1 or 2 got %d' % count
 150     count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
 151     assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count
 152
 153     manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .90')
 154     time.sleep(10)
 155
 156     # State NONE -> FULL
 157     log.info('6. Verify error messages again when exceeding full_ratio')
 158
 159     proc = mon.run(
 160              args=[
 161                  'sudo',
 162                  'daemon-helper',
 163                  'kill',
 164                  'ceph', '-w'
 165              ],
 166              stdin=run.PIPE,
 167              stdout=BytesIO(),
 168              wait=False,
 169         )
 170
 171     manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001')
 172
 173     time.sleep(sleep_time)
 174     proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
 175     proc.wait()
 176
 177     lines = six.ensure_str(proc.stdout.getvalue()).split('\n')
 178
 179     count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
 180     assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
 181     count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
 182     assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count
 183
 184     # State FULL -> NONE
 185     log.info('7. Verify no messages settings back to default')
 186
 187     manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')
 188     time.sleep(10)
 189
 190     proc = mon.run(
 191              args=[
 192                  'sudo',
 193                  'daemon-helper',
 194                  'kill',
 195                  'ceph', '-w'
 196              ],
 197              stdin=run.PIPE,
 198              stdout=BytesIO(),
 199              wait=False,
 200         )
 201
 202     time.sleep(sleep_time)
 203     proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
 204     proc.wait()
 205
 206     lines = six.ensure_str(proc.stdout.getvalue()).split('\n')
 207
 208     count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
 209     assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
 210     count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
 211     assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count
 212
 213     log.info('Test Passed')
 214
 215     # Bring all OSDs back in
 216     manager.remove_pool("foo")
 217     for osd in osds:
 218         if osd['osd'] != 0:
 219             manager.mark_in_osd(osd['osd'])