1"""
2Inconsistent_hinfo
3"""
import logging
import time
from dateutil.parser import parse
from tasks import ceph_manager
from tasks.util.rados import rados
from teuthology import misc as teuthology

log = logging.getLogger(__name__)

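# Poll the PG stats (every 5 seconds, up to 300 tries) until a deep-scrub
# newer than check_time_now is reported, then assert that the PG state does
# (or does not, depending on `inconsistent`) contain 'inconsistent'.
# Fails if no fresh deep-scrub is seen within the retry budget.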
def wait_for_deep_scrub_complete(manager, pgid, check_time_now, inconsistent):
    log.debug("waiting for pg %s deep-scrub complete (check_time_now=%s)" %
              (pgid, check_time_now))
    for i in range(300):
        time.sleep(5)
        manager.flush_pg_stats([0, 1, 2, 3])
        pgs = manager.get_pg_stats()
        pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
        log.debug('pg=%s' % pg)
        assert pg

        last_deep_scrub_time = parse(pg['last_deep_scrub_stamp']).strftime('%s')
        if last_deep_scrub_time < check_time_now:
            log.debug('not scrubbed')
            continue

        status = pg['state'].split('+')
        if inconsistent:
            assert 'inconsistent' in status
        else:
            assert 'inconsistent' not in status
        return

    assert False, 'not scrubbed'


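# Poll the PG stats until the PG is active, to_osd has appeared in the up set
# and backfilling has finished. While backfilling, from_osd is expected to
# remain in the acting set and to_osd in the up set. Returns without asserting
# if the retry budget is exhausted.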
def wait_for_backfilling_complete(manager, pgid, from_osd, to_osd):
    log.debug("waiting for pg %s backfill from osd.%s to osd.%s complete" %
              (pgid, from_osd, to_osd))
    for i in range(300):
        time.sleep(5)
        manager.flush_pg_stats([0, 1, 2, 3])
        pgs = manager.get_pg_stats()
        pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
        log.info('pg=%s' % pg)
        assert pg
        status = pg['state'].split('+')
        if 'active' not in status:
            log.debug('not active')
            continue
        if 'backfilling' in status:
            assert from_osd in pg['acting'] and to_osd in pg['up']
            log.debug('backfilling')
            continue
        if to_osd not in pg['up']:
            log.debug('backfill not started yet')
            continue
        log.debug('backfilled!')
        break

def task(ctx, config):
    """
    Test handling of objects with inconsistent hash info during backfill
    and deep-scrub.

    A fairly rigid cluster is brought up and tested by this task: it
    expects OSDs 0-3 and drives a single erasure-coded PG through
    deep-scrub, repair and backfill scenarios.
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'ec_inconsistent_hinfo task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    profile = config.get('erasure_code_profile', {
        'k': '2',
        'm': '1',
        'crush-failure-domain': 'osd'
    })
    profile_name = profile.get('name', 'backfill_unfound')
    manager.create_erasure_code_profile(profile_name, profile)
    pool = manager.create_pool_with_unique_name(
        pg_num=1,
        erasure_code_profile_name=profile_name,
        min_size=2)
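    # the whole test operates on a single PG (pg_num=1, pgid '<pool>.0'
    # below), so keep the autoscaler from splitting it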
    manager.raw_cluster_cmd('osd', 'pool', 'set', pool,
                            'pg_autoscale_mode', 'off')

    manager.flush_pg_stats([0, 1, 2, 3])
    manager.wait_for_clean()

    pool_id = manager.get_pool_num(pool)
    pgid = '%d.0' % pool_id
    pgs = manager.get_pg_stats()
    acting = next((pg['acting'] for pg in pgs if pg['pgid'] == pgid), None)
    log.info("acting=%s" % acting)
    assert acting
    primary = acting[0]

    # something that is always there, readable and never empty
    dummyfile = '/etc/group'

    # kludge to make sure they get a map
    rados(ctx, mon, ['-p', pool, 'put', 'dummy', dummyfile])

    manager.flush_pg_stats([0, 1])
    manager.wait_for_recovery()

    log.debug("create test object")
    obj = 'test'
    rados(ctx, mon, ['-p', pool, 'put', obj, dummyfile])

    victim = acting[1]

    log.info("remove test object hash info from osd.%s shard and test deep-scrub and repair"
             % victim)

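    # strip the hinfo_key attribute from one non-primary shard with the
    # objectstore tool: the next deep-scrub is expected to flag the PG as
    # inconsistent, and a subsequent 'pg repair' to clear the inconsistency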
    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=victim)
    check_time_now = time.strftime('%s')
    manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
    wait_for_deep_scrub_complete(manager, pgid, check_time_now, True)

    check_time_now = time.strftime('%s')
    manager.raw_cluster_cmd('pg', 'repair', pgid)
    wait_for_deep_scrub_complete(manager, pgid, check_time_now, False)

    log.info("remove test object hash info from primary osd.%s shard and test backfill"
             % primary)

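    # only one shard has lost its hinfo_key, so the object should still be
    # recoverable: backfill to a fresh OSD is expected to complete and leave
    # the PG clean with no unfound objects (asserted below); the rados bench
    # just seeds some extra objects for backfill to move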
    log.debug("write some data")
    rados(ctx, mon, ['-p', pool, 'bench', '30', 'write', '-b', '4096',
                     '--no-cleanup'])

    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=primary)

    # mark the osd out to trigger a rebalance/backfill
    source = acting[1]
    target = [x for x in [0, 1, 2, 3] if x not in acting][0]
    manager.mark_out_osd(source)

    # wait for everything to peer, backfill and recover
    wait_for_backfilling_complete(manager, pgid, source, target)
    manager.wait_for_clean()

    manager.flush_pg_stats([0, 1, 2, 3])
    pgs = manager.get_pg_stats()
    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
    log.debug('pg=%s' % pg)
    assert pg
    assert 'clean' in pg['state'].split('+')
    assert 'inconsistent' not in pg['state'].split('+')
    unfound = manager.get_num_unfound_objects()
    log.debug("there are %d unfound objects" % unfound)
    assert unfound == 0

    source, target = target, source
    log.info("remove test object hash info from non-primary osd.%s shard and test backfill"
             % source)

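    # same expectation as above, this time with the missing attribute on a
    # non-primary shard and backfill triggered by marking the out OSD back in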
    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=source)

    # mark the osd in to trigger a rebalance/backfill
    manager.mark_in_osd(target)

    # wait for everything to peer, backfill and recover
    wait_for_backfilling_complete(manager, pgid, source, target)
    manager.wait_for_clean()

    manager.flush_pg_stats([0, 1, 2, 3])
    pgs = manager.get_pg_stats()
    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
    log.debug('pg=%s' % pg)
    assert pg
    assert 'clean' in pg['state'].split('+')
    assert 'inconsistent' not in pg['state'].split('+')
    unfound = manager.get_num_unfound_objects()
    log.debug("there are %d unfound objects" % unfound)
    assert unfound == 0

    log.info("remove hash info from two shards and test backfill")

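    # with the default k=2/m=1 profile, removing hinfo_key from two of the
    # three shards leaves the object unreconstructable during backfill, so it
    # is expected to be reported as unfound and the PG to end up in the
    # backfill_unfound state (asserted below)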
    source = acting[2]
    target = [x for x in [0, 1, 2, 3] if x not in acting][0]
    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=primary)
    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=source)

    # mark the osd out to trigger a rebalance/backfill
    manager.mark_out_osd(source)

    # wait for everything to peer, backfill and detect the unfound object
    wait_for_backfilling_complete(manager, pgid, source, target)

    # verify that there is an unfound object
    manager.flush_pg_stats([0, 1, 2, 3])
    pgs = manager.get_pg_stats()
    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
    log.debug('pg=%s' % pg)
    assert pg
    assert 'backfill_unfound' in pg['state'].split('+')
    unfound = manager.get_num_unfound_objects()
    log.debug("there are %d unfound objects" % unfound)
    assert unfound == 1
    m = manager.list_pg_unfound(pgid)
    log.debug('list_pg_unfound=%s' % m)
    assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']

    # mark the unfound object as lost (delete it) so recovery can proceed
    pgs = manager.get_pg_stats()
    manager.raw_cluster_cmd('pg', pgid, 'mark_unfound_lost', 'delete')

    # wait for everything to peer and be happy...
    manager.flush_pg_stats([0, 1, 2, 3])
    manager.wait_for_recovery()