]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | """ |
2 | Test pool repairing after objects are damaged. | |
3 | """ | |
4 | import logging | |
5 | import time | |
6 | ||
7 | from teuthology import misc as teuthology | |
8 | ||
9 | log = logging.getLogger(__name__) | |
10 | ||
11 | ||
def choose_primary(manager, pool, num):
    """Select the primary OSD of pg *num* in *pool* as the test victim."""
    log.info("Choosing primary")
    return manager.get_pg_primary(pool, num)
18 | ||
19 | ||
def choose_replica(manager, pool, num):
    """Select a replica OSD of pg *num* in *pool* as the test victim."""
    log.info("Choosing replica")
    return manager.get_pg_replica(pool, num)
26 | ||
27 | ||
def trunc(manager, osd, pool, obj):
    """Corrupt *obj* by truncating it to 1 byte via the OSD admin socket."""
    log.info("truncating object")
    cmd = ['truncobj', pool, obj, '1']
    return manager.osd_admin_socket(osd, cmd)
36 | ||
37 | ||
def dataerr(manager, osd, pool, obj):
    """Inject a data read error on *obj* via the OSD admin socket."""
    log.info("injecting data err on object")
    cmd = ['injectdataerr', pool, obj]
    return manager.osd_admin_socket(osd, cmd)
46 | ||
47 | ||
def mdataerr(manager, osd, pool, obj):
    """Inject a metadata read error on *obj* via the OSD admin socket."""
    log.info("injecting mdata err on object")
    cmd = ['injectmdataerr', pool, obj]
    return manager.osd_admin_socket(osd, cmd)
56 | ||
57 | ||
def omaperr(manager, osd, pool, obj):
    """Corrupt *obj*'s omap by writing a bogus key/value on one OSD."""
    log.info("injecting omap err on object")
    cmd = ['setomapval', pool, obj, 'badkey', 'badval']
    return manager.osd_admin_socket(osd, cmd)
65 | ||
66 | ||
def repair_test_1(manager, corrupter, chooser, scrub_type):
    """
    Write one object into a fresh pool, corrupt it on the chosen OSD,
    scrub to show the PG goes inconsistent, then repair and re-scrub
    to show the PG becomes consistent again.

    :param corrupter: error generating function (truncate, data-error, or
                      meta-data error, for example).
    :param chooser: osd type chooser (primary or replica)
    :param scrub_type: regular scrub or deep-scrub
    """
    pool = "repair_pool_1"
    manager.wait_for_clean()
    with manager.pool(pool, 1):
        log.info("starting repair test type 1")
        victim = chooser(manager, pool, 0)

        # write the object that will be damaged
        log.info("doing put")
        manager.do_put(pool, 'repair_test_obj', '/etc/hosts')

        # damage it on the victim OSD
        log.info("corrupting object")
        corrupter(manager, victim, pool, 'repair_test_obj')

        # a scrub of the requested type must flag the PG inconsistent
        log.info("scrubbing")
        manager.do_pg_scrub(pool, 0, scrub_type)
        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)

        log.info("repairing")
        manager.do_pg_scrub(pool, 0, "repair")

        log.info("re-scrubbing")
        manager.do_pg_scrub(pool, 0, scrub_type)

        # after repair + rescrub the inconsistent flag must be gone
        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' not in s)
        log.info("done")
109 | ||
110 | ||
def repair_test_2(ctx, manager, config, chooser):
    """
    First creates a set of objects and
    sets the omap value.  It then corrupts an object, does both a scrub
    and a deep-scrub, and then corrupts more objects.  After that, it
    repairs the pool and makes sure that the pool is consistent some
    time after a deep-scrub.

    :param ctx: teuthology run context (used to locate the first mon)
    :param manager: ceph manager driving the cluster
    :param config: task config, forwarded to get_first_mon
    :param chooser: primary or replica selection routine.
    """
    pool = "repair_pool_2"
    manager.wait_for_clean()
    with manager.pool(pool, 1):
        log.info("starting repair test type 2")
        victim_osd = chooser(manager, pool, 0)
        first_mon = teuthology.get_first_mon(ctx, config)
        # BUGFIX: dict.iterkeys() exists only on Python 2; unpacking the
        # keys view works on both Python 2 and Python 3.
        (mon,) = ctx.cluster.only(first_mon).remotes.keys()

        # create objects; file1 and file5 also get an omap value
        log.info("doing put and setomapval")
        manager.do_put(pool, 'file1', '/etc/hosts')
        manager.do_rados(mon, ['-p', pool, 'setomapval', 'file1',
                               'key', 'val'])
        manager.do_put(pool, 'file2', '/etc/hosts')
        manager.do_put(pool, 'file3', '/etc/hosts')
        manager.do_put(pool, 'file4', '/etc/hosts')
        manager.do_put(pool, 'file5', '/etc/hosts')
        manager.do_rados(mon, ['-p', pool, 'setomapval', 'file5',
                               'key', 'val'])
        manager.do_put(pool, 'file6', '/etc/hosts')

        # corrupt object
        log.info("corrupting object")
        omaperr(manager, victim_osd, pool, 'file1')

        # verify inconsistent
        log.info("scrubbing")
        manager.do_pg_scrub(pool, 0, 'deep-scrub')

        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)

        # Regression test for bug #4778, should still
        # be inconsistent after scrub
        manager.do_pg_scrub(pool, 0, 'scrub')

        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)

        # Additional corruptions including 2 types for file1
        log.info("corrupting more objects")
        dataerr(manager, victim_osd, pool, 'file1')
        mdataerr(manager, victim_osd, pool, 'file2')
        trunc(manager, victim_osd, pool, 'file3')
        omaperr(manager, victim_osd, pool, 'file6')

        # see still inconsistent
        log.info("scrubbing")
        manager.do_pg_scrub(pool, 0, 'deep-scrub')

        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)

        # repair
        log.info("repairing")
        manager.do_pg_scrub(pool, 0, "repair")

        # Let repair clear inconsistent flag
        time.sleep(10)

        # verify consistent
        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' not in s)

        # In the future repair might determine state of
        # inconsistency itself, verify with a deep-scrub
        log.info("scrubbing")
        manager.do_pg_scrub(pool, 0, 'deep-scrub')

        # verify consistent
        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' not in s)

        log.info("done")
190 | ||
191 | ||
def hinfoerr(manager, victim, pool, obj):
    """Corrupt an erasure-coded object by deleting its hinfo_key attr."""
    log.info("remove the hinfo_key")
    kwargs = dict(options='',
                  args='rm-attr hinfo_key',
                  object_name=obj,
                  osd=victim)
    manager.objectstore_tool(pool, **kwargs)
202 | ||
203 | ||
def repair_test_erasure_code(manager, corrupter, victim, scrub_type):
    """
    Write one object into a fresh erasure-coded pool, corrupt it on the
    given victim OSD, scrub to show the PG goes inconsistent, then repair
    and re-scrub to show the PG becomes consistent again.

    :param corrupter: error generating function.
    :param victim: osd to corrupt the object on
    :param scrub_type: regular scrub or deep-scrub
    """
    pool = "repair_pool_3"
    manager.wait_for_clean()
    with manager.pool(pool_name=pool, pg_num=1,
                      erasure_code_profile_name='default'):
        log.info("starting repair test for erasure code")

        # write the object that will be damaged
        log.info("doing put")
        manager.do_put(pool, 'repair_test_obj', '/etc/hosts')

        # damage it on the victim OSD
        log.info("corrupting object")
        corrupter(manager, victim, pool, 'repair_test_obj')

        # a scrub of the requested type must flag the PG inconsistent
        log.info("scrubbing")
        manager.do_pg_scrub(pool, 0, scrub_type)
        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)

        log.info("repairing")
        manager.do_pg_scrub(pool, 0, "repair")

        log.info("re-scrubbing")
        manager.do_pg_scrub(pool, 0, scrub_type)

        # after repair + rescrub the inconsistent flag must be gone
        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' not in s)
        log.info("done")
245 | ||
246 | ||
def task(ctx, config):
    """
    Test [deep] repair in several situations:
      Repair [Truncate, Data EIO, MData EIO] on [Primary|Replica]

    The config should be as follows:

      Must include the log-whitelist below
      Must enable filestore_debug_inject_read_err config

    example:

    tasks:
    - chef:
    - install:
    - ceph:
        log-whitelist:
          - 'candidate had a stat error'
          - 'candidate had a read error'
          - 'deep-scrub 0 missing, 1 inconsistent objects'
          - 'deep-scrub 0 missing, 4 inconsistent objects'
          - 'deep-scrub [0-9]+ errors'
          - '!= omap_digest'
          - '!= data_digest'
          - 'repair 0 missing, 1 inconsistent objects'
          - 'repair 0 missing, 4 inconsistent objects'
          - 'repair [0-9]+ errors, [0-9]+ fixed'
          - 'scrub 0 missing, 1 inconsistent objects'
          - 'scrub [0-9]+ errors'
          - 'size 1 != size'
          - 'attr name mismatch'
          - 'Regular scrub request, deep-scrub details will be lost'
        conf:
          osd:
            filestore debug inject read err: true
    - repair_test:

    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'repair_test task only accepts a dict for config'

    manager = ctx.managers['ceph']
    manager.wait_for_all_osds_up()

    # Disable automatic scrubbing so only our explicit scrub requests run.
    manager.raw_cluster_cmd('osd', 'set', 'noscrub')
    manager.raw_cluster_cmd('osd', 'set', 'nodeep-scrub')

    # BUGFIX: guarantee the flags are unset even if a sub-test raises;
    # previously a failure left the cluster with scrubbing disabled.
    try:
        repair_test_1(manager, mdataerr, choose_primary, "scrub")
        repair_test_1(manager, mdataerr, choose_replica, "scrub")
        repair_test_1(manager, dataerr, choose_primary, "deep-scrub")
        repair_test_1(manager, dataerr, choose_replica, "deep-scrub")
        repair_test_1(manager, trunc, choose_primary, "scrub")
        repair_test_1(manager, trunc, choose_replica, "scrub")
        repair_test_2(ctx, manager, config, choose_primary)
        repair_test_2(ctx, manager, config, choose_replica)

        repair_test_erasure_code(manager, hinfoerr, 'primary', "deep-scrub")
    finally:
        manager.raw_cluster_cmd('osd', 'unset', 'noscrub')
        manager.raw_cluster_cmd('osd', 'unset', 'nodeep-scrub')