"""
Inconsistent_hinfo
"""
import logging
import time

from dateutil.parser import parse

from tasks import ceph_manager
from tasks.util.rados import rados
from teuthology import misc as teuthology

log = logging.getLogger(__name__)

def wait_for_deep_scrub_complete(manager, pgid, check_time_now, inconsistent):
    """
    Poll until pg `pgid` reports a deep-scrub stamp newer than
    `check_time_now`, then verify its 'inconsistent' state flag.

    :param manager: ceph_manager.CephManager driving the cluster
    :param pgid: pg id string, e.g. '2.0'
    :param check_time_now: epoch-seconds string (from time.strftime('%s'))
                           captured just before the scrub/repair was requested
    :param inconsistent: True if the pg is expected to be flagged
                         'inconsistent' once the scrub completes
    :raises AssertionError: if the pg is missing, the 'inconsistent' flag
                            does not match expectations, or the scrub does
                            not complete within ~25 minutes (300 x 5s polls)
    """
    log.debug("waiting for pg %s deep-scrub complete (check_time_now=%s)",
              pgid, check_time_now)
    # Compare numerically: the original compared two epoch-seconds strings
    # lexicographically, which is only correct while both strings happen to
    # have the same number of digits.
    threshold = float(check_time_now)
    for _ in range(300):
        time.sleep(5)
        manager.flush_pg_stats([0, 1, 2, 3])
        pgs = manager.get_pg_stats()
        pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
        log.debug('pg=%s', pg)
        assert pg

        # timestamp() is portable; strftime('%s') is a glibc extension.
        last_deep_scrub_time = parse(pg['last_deep_scrub_stamp']).timestamp()
        if last_deep_scrub_time < threshold:
            log.debug('not scrubbed')
            continue

        status = pg['state'].split('+')
        if inconsistent:
            assert 'inconsistent' in status
        else:
            assert 'inconsistent' not in status
        return

    assert False, 'not scrubbed'
38 | ||
def wait_for_backfilling_complete(manager, pgid, from_osd, to_osd):
    """
    Poll until pg `pgid` has finished backfilling onto osd `to_osd`.

    While the pg reports 'backfilling', osd `from_osd` must still be in the
    acting set and `to_osd` must be in the up set.

    :param manager: ceph_manager.CephManager driving the cluster
    :param pgid: pg id string, e.g. '2.0'
    :param from_osd: osd id the data is being backfilled from
    :param to_osd: osd id the data is being backfilled to
    :raises AssertionError: if the pg is missing, the acting/up sets are
                            inconsistent during backfill, or backfill does
                            not complete within ~25 minutes (300 x 5s polls)
    """
    log.debug("waiting for pg %s backfill from osd.%s to osd.%s complete",
              pgid, from_osd, to_osd)
    for _ in range(300):
        # Poll first, then sleep between unsuccessful polls; the original
        # unconditionally slept 5s before even the first check.
        manager.flush_pg_stats([0, 1, 2, 3])
        pgs = manager.get_pg_stats()
        pg = next((p for p in pgs if p['pgid'] == pgid), None)
        log.info('pg=%s', pg)
        assert pg

        status = pg['state'].split('+')
        if 'active' not in status:
            log.debug('not active')
        elif 'backfilling' in status:
            assert from_osd in pg['acting'] and to_osd in pg['up']
            log.debug('backfilling')
        elif to_osd not in pg['up']:
            log.debug('backfill not started yet')
        else:
            log.debug('backfilled!')
            break
        time.sleep(5)
    else:
        # Mirror wait_for_deep_scrub_complete: fail loudly on timeout
        # instead of silently returning.
        assert False, 'backfilling not completed'
def _assert_clean_and_no_unfound(manager, pgid):
    """
    Assert that pg `pgid` is 'clean', not 'inconsistent', and that the
    cluster reports zero unfound objects (post-backfill sanity check).
    """
    manager.flush_pg_stats([0, 1, 2, 3])
    pgs = manager.get_pg_stats()
    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
    log.debug('pg=%s', pg)
    assert pg
    assert 'clean' in pg['state'].split('+')
    assert 'inconsistent' not in pg['state'].split('+')
    unfound = manager.get_num_unfound_objects()
    log.debug("there are %d unfound objects", unfound)
    assert unfound == 0


def task(ctx, config):
    """
    Test handling of objects with inconsistent hash info during backfill
    and deep-scrub.

    A pretty rigid cluster is brought up and tested by this task:
    a 1-pg k=2/m=1 EC pool is created, the 'hinfo_key' xattr is stripped
    from various shards of a test object with the objectstore tool, and the
    task then checks that deep-scrub flags (and repair clears) the
    inconsistency, and that backfill either recovers the object or reports
    it as backfill_unfound when too many shards are damaged.

    :param ctx: teuthology run context
    :param config: optional dict; honors 'erasure_code_profile'
    :raises AssertionError: on any unexpected pg state
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'ec_inconsistent_hinfo task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    profile = config.get('erasure_code_profile', {
        'k': '2',
        'm': '1',
        'crush-failure-domain': 'osd'
    })
    # NOTE(review): default profile name 'backfill_unfound' looks inherited
    # from a sibling task — kept as-is for backward compatibility.
    profile_name = profile.get('name', 'backfill_unfound')
    manager.create_erasure_code_profile(profile_name, profile)
    pool = manager.create_pool_with_unique_name(
        pg_num=1,
        erasure_code_profile_name=profile_name,
        min_size=2)
    # Keep pg_num pinned at 1 so pgid below stays valid.
    manager.raw_cluster_cmd('osd', 'pool', 'set', pool,
                            'pg_autoscale_mode', 'off')

    manager.flush_pg_stats([0, 1, 2, 3])
    manager.wait_for_clean()

    pool_id = manager.get_pool_num(pool)
    pgid = '%d.0' % pool_id
    pgs = manager.get_pg_stats()
    acting = next((pg['acting'] for pg in pgs if pg['pgid'] == pgid), None)
    log.info("acting=%s", acting)
    assert acting
    primary = acting[0]

    # something that is always there, readable and never empty
    dummyfile = '/etc/group'

    # kludge to make sure they get a map
    rados(ctx, mon, ['-p', pool, 'put', 'dummy', dummyfile])

    manager.flush_pg_stats([0, 1])
    manager.wait_for_recovery()

    log.debug("create test object")
    obj = 'test'
    rados(ctx, mon, ['-p', pool, 'put', obj, dummyfile])

    victim = acting[1]

    # --- scenario 1: damaged non-primary shard; deep-scrub must flag it
    # inconsistent, repair must clear it -------------------------------
    log.info("remove test object hash info from osd.%s shard and test deep-scrub and repair",
             victim)

    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=victim)
    check_time_now = time.strftime('%s')
    manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
    wait_for_deep_scrub_complete(manager, pgid, check_time_now, True)

    check_time_now = time.strftime('%s')
    manager.raw_cluster_cmd('pg', 'repair', pgid)
    wait_for_deep_scrub_complete(manager, pgid, check_time_now, False)

    # --- scenario 2: damaged primary shard; backfill must recover -----
    log.info("remove test object hash info from primary osd.%s shard and test backfill",
             primary)

    log.debug("write some data")
    rados(ctx, mon, ['-p', pool, 'bench', '30', 'write', '-b', '4096',
                     '--no-cleanup'])

    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=primary)

    # mark the osd out to trigger a rebalance/backfill
    source = acting[1]
    target = [x for x in [0, 1, 2, 3] if x not in acting][0]
    manager.mark_out_osd(source)

    # wait for everything to peer, backfill and recover
    wait_for_backfilling_complete(manager, pgid, source, target)
    manager.wait_for_clean()

    _assert_clean_and_no_unfound(manager, pgid)

    # --- scenario 3: damaged non-primary shard; backfill back onto the
    # previously-out osd must recover ----------------------------------
    source, target = target, source
    log.info("remove test object hash info from non-primary osd.%s shard and test backfill",
             source)

    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=source)

    # mark the osd in to trigger a rebalance/backfill
    manager.mark_in_osd(target)

    # wait for everything to peer, backfill and recover
    wait_for_backfilling_complete(manager, pgid, source, target)
    manager.wait_for_clean()

    _assert_clean_and_no_unfound(manager, pgid)

    # --- scenario 4: two damaged shards (k=2/m=1 cannot reconstruct);
    # backfill must report the object unfound --------------------------
    log.info("remove hash info from two shards and test backfill")

    source = acting[2]
    target = [x for x in [0, 1, 2, 3] if x not in acting][0]
    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=primary)
    manager.objectstore_tool(pool, options='', args='rm-attr hinfo_key',
                             object_name=obj, osd=source)

    # mark the osd out to trigger a rebalance/backfill
    manager.mark_out_osd(source)

    # wait for everything to peer, backfill and detect unfound object
    wait_for_backfilling_complete(manager, pgid, source, target)

    # verify that there is unfound object
    manager.flush_pg_stats([0, 1, 2, 3])
    pgs = manager.get_pg_stats()
    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
    log.debug('pg=%s', pg)
    assert pg
    assert 'backfill_unfound' in pg['state'].split('+')
    unfound = manager.get_num_unfound_objects()
    log.debug("there are %d unfound objects", unfound)
    assert unfound == 1
    m = manager.list_pg_unfound(pgid)
    log.debug('list_pg_unfound=%s', m)
    assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']

    # mark stuff lost
    manager.raw_cluster_cmd('pg', pgid, 'mark_unfound_lost', 'delete')

    # wait for everything to peer and be happy...
    manager.flush_pg_stats([0, 1, 2, 3])
    manager.wait_for_recovery()