import logging
import time

from tasks import ceph_manager
from tasks.util.rados import rados
from teuthology import misc as teuthology
# Module-level logger, namespaced by this module's import path.
log = logging.getLogger(__name__)
def wait_for_pg_state(manager, pgid, state, to_osd):
    """Poll until PG ``pgid`` is active, in ``state``, and up on ``to_osd``.

    Polls every 5 seconds for up to 300 attempts (~25 minutes).

    :param manager: CephManager used to flush and query PG stats
    :param pgid: PG id string, e.g. ``'2.0'``
    :param state: state token expected in the PG's ``+``-separated state string
    :param to_osd: OSD id that must appear in the PG's ``up`` set
    :raises AssertionError: if the PG never reaches the requested state
    """
    log.debug("waiting for pg %s state is %s" % (pgid, state))
    # NOTE(review): the retry loop was lost in the garbled source (without it
    # the function falls straight through to the final `assert False`); the
    # loop below is reconstructed — confirm count/interval against upstream.
    for _ in range(300):
        time.sleep(5)
        # Force fresh PG stats from all four OSDs of this rigid cluster.
        manager.flush_pg_stats([0, 1, 2, 3])
        pgs = manager.get_pg_stats()
        pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
        log.info('pg=%s' % pg)
        assert pg
        status = pg['state'].split('+')
        if 'active' not in status:
            log.debug('not active')
            continue
        if state not in status:
            log.debug('not %s' % state)
            continue
        # Desired state reached; the target osd must be in the PG's up set.
        assert to_osd in pg['up']
        return
    assert False, '%s not in %s' % (pgid, state)
def task(ctx, config):
    """
    Test backfill reservation calculates "toofull" condition correctly.

    A pretty rigid cluster is brought up and tested by this task
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'backfill_toofull task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    # NOTE(review): the CephManager positional/keyword args other than
    # `logger` were lost in the garbled source and are reconstructed —
    # confirm against other tasks in this suite.
    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    # Erasure-code profile for the test pool.
    # NOTE(review): only 'crush-failure-domain' is visible in the source;
    # the k/m values below are reconstructed (k+m == 4 OSDs) — TODO confirm.
    profile = config.get('erasure_code_profile', {
        'k': 3,
        'm': 1,
        'crush-failure-domain': 'osd'
    })
    profile_name = profile.get('name', 'backfill_toofull')
    manager.create_erasure_code_profile(profile_name, profile)
    # NOTE(review): pg_num/min_size kwargs were lost in the garbled source;
    # a single PG is required for the '%d.0' pgid below — TODO confirm.
    pool = manager.create_pool_with_unique_name(
        pg_num=1,
        erasure_code_profile_name=profile_name,
        min_size=2)
    # Keep the PG count fixed so the single test PG is stable.
    manager.raw_cluster_cmd('osd', 'pool', 'set', pool,
                            'pg_autoscale_mode', 'off')

    manager.flush_pg_stats([0, 1, 2, 3])
    manager.wait_for_clean()

    pool_id = manager.get_pool_num(pool)
    pgid = '%d.0' % pool_id
    pgs = manager.get_pg_stats()
    acting = next((pg['acting'] for pg in pgs if pg['pgid'] == pgid), None)
    log.debug("acting=%s" % acting)

    # NOTE(review): these assignments were lost in the garbled source but are
    # required — `primary` and `target` are used throughout below.
    assert acting
    primary = acting[0]
    target = acting[1]

    log.debug("write some data")
    rados(ctx, mon, ['-p', pool, 'bench', '120', 'write', '--no-cleanup'])
    df = manager.get_osd_df(target)
    log.debug("target osd df: %s" % df)

    # NOTE(review): `total_kb` was lost in the garbled source; it is used in
    # both ratio computations below — TODO confirm it comes from df['kb'].
    total_kb = df['kb']
    used_kb = df['kb_used']

    log.debug("pause recovery")
    manager.raw_cluster_cmd('osd', 'set', 'noout')
    manager.raw_cluster_cmd('osd', 'set', 'nobackfill')
    manager.raw_cluster_cmd('osd', 'set', 'norecover')

    log.debug("stop tartget osd %s" % target)
    manager.kill_osd(target)
    manager.wait_till_active()

    pgs = manager.get_pg_stats()
    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
    log.debug('pg=%s' % pg)

    log.debug("re-write data")
    rados(ctx, mon, ['-p', pool, 'cleanup'])
    rados(ctx, mon, ['-p', pool, 'bench', '60', 'write', '--no-cleanup'])

    df = manager.get_osd_df(primary)
    log.debug("primary osd df: %s" % df)

    primary_used_kb = df['kb_used']

    log.info("test backfill reservation rejected with toofull")

    # We set backfillfull ratio less than new data size and expect the pg
    # entering backfill_toofull state.
    #
    # We also need to update nearfull ratio to prevent "full ratio(s) out of order".

    backfillfull = 0.9 * primary_used_kb / total_kb
    nearfull = backfillfull * 0.9

    log.debug("update nearfull ratio to %s and backfillfull ratio to %s" %
              (nearfull, backfillfull))
    # +0.001 guards against the 3-decimal formatting rounding the ratio down.
    manager.raw_cluster_cmd('osd', 'set-nearfull-ratio',
                            '{:.3f}'.format(nearfull + 0.001))
    manager.raw_cluster_cmd('osd', 'set-backfillfull-ratio',
                            '{:.3f}'.format(backfillfull + 0.001))

    log.debug("start tartget osd %s" % target)

    manager.revive_osd(target)
    manager.wait_for_active()
    manager.wait_till_osd_is_up(target)

    wait_for_pg_state(manager, pgid, 'backfill_toofull', target)

    log.info("test pg not enter backfill_toofull after restarting backfill")

    # We want to set backfillfull ratio to be big enough for the target to
    # successfully backfill new data but smaller than the sum of old and new
    # data, so if the osd backfill reservation incorrectly calculates "toofull"
    # the test will detect this (fail).
    #
    # Note, we need to operate with "uncompressed" bytes because currently
    # osd backfill reservation does not take compression into account.
    #
    # We also need to update nearfull ratio to prevent "full ratio(s) out of order".

    pdf = manager.get_pool_df(pool)
    log.debug("pool %s df: %s" % (pool, pdf))

    compress_ratio = 1.0 * pdf['compress_under_bytes'] / pdf['compress_bytes_used'] \
        if pdf['compress_bytes_used'] > 0 else 1.0
    log.debug("compress_ratio: %s" % compress_ratio)

    backfillfull = (used_kb + primary_used_kb) * compress_ratio / total_kb
    assert backfillfull < 0.9
    nearfull_min = max(used_kb, primary_used_kb) * compress_ratio / total_kb
    assert nearfull_min < backfillfull
    delta = backfillfull - nearfull_min
    nearfull = nearfull_min + delta * 0.1
    backfillfull = nearfull_min + delta * 0.2

    log.debug("update nearfull ratio to %s and backfillfull ratio to %s" %
              (nearfull, backfillfull))
    manager.raw_cluster_cmd('osd', 'set-nearfull-ratio',
                            '{:.3f}'.format(nearfull + 0.001))
    manager.raw_cluster_cmd('osd', 'set-backfillfull-ratio',
                            '{:.3f}'.format(backfillfull + 0.001))

    wait_for_pg_state(manager, pgid, 'backfilling', target)

    pgs = manager.get_pg_stats()
    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
    log.debug('pg=%s' % pg)

    log.debug("interrupt %s backfill" % target)
    manager.mark_down_osd(target)
    # after marking the target osd down it will automatically be
    # brought back up (the daemon keeps running) — NOTE(review): the original
    # continuation of this comment was lost; confirm against upstream.

    log.debug("resume recovery")
    manager.raw_cluster_cmd('osd', 'unset', 'noout')
    manager.raw_cluster_cmd('osd', 'unset', 'nobackfill')
    manager.raw_cluster_cmd('osd', 'unset', 'norecover')

    # wait for everything to peer, backfill and recover
    manager.wait_for_clean()

    pgs = manager.get_pg_stats()
    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
    log.info('pg=%s' % pg)

    assert 'clean' in pg['state'].split('+')