1 """
2 Backfill_toofull
3 """
4 import logging
5 import time
6 from tasks import ceph_manager
7 from tasks.util.rados import rados
8 from teuthology import misc as teuthology
9
10 log = logging.getLogger(__name__)
11
def wait_for_pg_state(manager, pgid, state, to_osd):
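    """
    Poll pg stats until `pgid` is active and in `state` with `to_osd`
    in its up set; give up (and fail) after 300 attempts, 5s apart.
    """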
    log.debug("waiting for pg %s to reach state %s" % (pgid, state))
    for _ in range(300):
        time.sleep(5)
        manager.flush_pg_stats([0, 1, 2, 3])
        pgs = manager.get_pg_stats()
        pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
        log.info('pg=%s' % pg)
        assert pg
        status = pg['state'].split('+')
        if 'active' not in status:
            log.debug('not active')
            continue
        if state not in status:
            log.debug('not %s' % state)
            continue
        assert to_osd in pg['up']
        return
    assert False, 'pg %s failed to reach state %s' % (pgid, state)


def task(ctx, config):
34 """
35 Test backfill reservation calculates "toofull" condition correctly.
36
37 A pretty rigid cluster is brought up and tested by this task
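
    A hypothetical job snippet, for illustration only (assumes the usual
    teuthology install/ceph tasks; the real suite may wire this up
    differently)::

        tasks:
        - install:
        - ceph:
        - backfill_toofull: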
38 """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'backfill_toofull task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

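    # a 2+1 erasure-coded pool with an osd failure domain: the pg keeps
    # three shards on three of the four osds, so one osd can be stopped
    # while the pg stays active (min_size=2)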
    profile = config.get('erasure_code_profile', {
        'k': '2',
        'm': '1',
        'crush-failure-domain': 'osd'
    })
    profile_name = profile.get('name', 'backfill_toofull')
    manager.create_erasure_code_profile(profile_name, profile)
    pool = manager.create_pool_with_unique_name(
        pg_num=1,
        erasure_code_profile_name=profile_name,
        min_size=2)
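    # keep pg_num fixed at 1 so all the test data lands in the single pg
    # under test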
    manager.raw_cluster_cmd('osd', 'pool', 'set', pool,
                            'pg_autoscale_mode', 'off')

    manager.flush_pg_stats([0, 1, 2, 3])
    manager.wait_for_clean()

    pool_id = manager.get_pool_num(pool)
    pgid = '%d.0' % pool_id
    pgs = manager.get_pg_stats()
    acting = next((pg['acting'] for pg in pgs if pg['pgid'] == pgid), None)
    log.debug("acting=%s" % acting)
    assert acting
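    # acting[0] is the pg's primary; acting[1] is the osd we will stop
    # and later force to backfill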
    primary = acting[0]
    target = acting[1]

    log.debug("write some data")
    rados(ctx, mon, ['-p', pool, 'bench', '120', 'write', '--no-cleanup'])
    df = manager.get_osd_df(target)
    log.debug("target osd df: %s" % df)

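    # remember the target's capacity and how much of it the initial data
    # set consumes; both feed the ratio calculations below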
    total_kb = df['kb']
    used_kb = df['kb_used']

    log.debug("pause recovery")
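    # noout keeps the stopped osd from being marked out (so the pg
    # mapping does not change); nobackfill/norecover freeze data
    # movement while the scenario is staged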
    manager.raw_cluster_cmd('osd', 'set', 'noout')
    manager.raw_cluster_cmd('osd', 'set', 'nobackfill')
    manager.raw_cluster_cmd('osd', 'set', 'norecover')

    log.debug("stop target osd %s" % target)
    manager.kill_osd(target)
    manager.wait_till_active()

    pgs = manager.get_pg_stats()
    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
    log.debug('pg=%s' % pg)
    assert pg

    log.debug("re-write data")
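    # with the target down, delete the benchmark objects and write a
    # fresh set: the stopped osd still holds the old data on disk, while
    # the surviving osds only account for the new data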
    rados(ctx, mon, ['-p', pool, 'cleanup'])
    time.sleep(10)
    rados(ctx, mon, ['-p', pool, 'bench', '60', 'write', '--no-cleanup'])

    df = manager.get_osd_df(primary)
    log.debug("primary osd df: %s" % df)

    primary_used_kb = df['kb_used']

    log.info("test backfill reservation rejected with toofull")

    # We set the backfillfull ratio below the new data size and expect
    # the pg to enter the backfill_toofull state.
    #
    # We also need to update the nearfull ratio to prevent "full ratio(s)
    # out of order".

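    # backfillfull is set to 90% of the fraction the new data alone
    # occupies, so a reservation for the new data cannot fit and must be
    # rejected as toofull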
    backfillfull = 0.9 * primary_used_kb / total_kb
    nearfull = backfillfull * 0.9

    log.debug("update nearfull ratio to %s and backfillfull ratio to %s" %
              (nearfull, backfillfull))
    manager.raw_cluster_cmd('osd', 'set-nearfull-ratio',
                            '{:.3f}'.format(nearfull + 0.001))
    manager.raw_cluster_cmd('osd', 'set-backfillfull-ratio',
                            '{:.3f}'.format(backfillfull + 0.001))

    log.debug("start target osd %s" % target)

    manager.revive_osd(target)
    manager.wait_for_active()
    manager.wait_till_osd_is_up(target)

    wait_for_pg_state(manager, pgid, 'backfill_toofull', target)

    log.info("test pg does not enter backfill_toofull after restarting backfill")

    # We want to set the backfillfull ratio big enough for the target to
    # successfully backfill the new data, but smaller than the sum of the
    # old and new data, so that if the osd backfill reservation
    # incorrectly calculates "toofull" the test will detect this (fail).
    #
    # Note, we need to operate with "uncompressed" bytes because
    # currently the osd backfill reservation does not take compression
    # into account.
    #
    # We also need to update the nearfull ratio to prevent "full ratio(s)
    # out of order".

    pdf = manager.get_pool_df(pool)
    log.debug("pool %s df: %s" % (pool, pdf))
    assert pdf
    compress_ratio = 1.0 * pdf['compress_under_bytes'] / pdf['compress_bytes_used'] \
        if pdf['compress_bytes_used'] > 0 else 1.0
    log.debug("compress_ratio: %s" % compress_ratio)

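    # place both thresholds in the gap between the larger single data set
    # and the old+new sum: backfilling just the new data stays under
    # backfillfull, while a buggy reservation that also counts the stale
    # data would still trip toofull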
    backfillfull = (used_kb + primary_used_kb) * compress_ratio / total_kb
    assert backfillfull < 0.9
    nearfull_min = max(used_kb, primary_used_kb) * compress_ratio / total_kb
    assert nearfull_min < backfillfull
    delta = backfillfull - nearfull_min
    nearfull = nearfull_min + delta * 0.1
    backfillfull = nearfull_min + delta * 0.2

    log.debug("update nearfull ratio to %s and backfillfull ratio to %s" %
              (nearfull, backfillfull))
    manager.raw_cluster_cmd('osd', 'set-nearfull-ratio',
                            '{:.3f}'.format(nearfull + 0.001))
    manager.raw_cluster_cmd('osd', 'set-backfillfull-ratio',
                            '{:.3f}'.format(backfillfull + 0.001))

    wait_for_pg_state(manager, pgid, 'backfilling', target)

    pgs = manager.get_pg_stats()
    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
    log.debug('pg=%s' % pg)
    assert pg

    log.debug("interrupt %s backfill" % target)
    manager.mark_down_osd(target)
    # after marking the target osd down it will automatically come back
    # up again shortly

    log.debug("resume recovery")
    manager.raw_cluster_cmd('osd', 'unset', 'noout')
    manager.raw_cluster_cmd('osd', 'unset', 'nobackfill')
    manager.raw_cluster_cmd('osd', 'unset', 'norecover')

    # wait for everything to peer, backfill and recover
    manager.wait_for_clean()

    pgs = manager.get_pg_stats()
    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
    log.info('pg=%s' % pg)
    assert pg
    assert 'clean' in pg['state'].split('+')