1 """
2 Test pool repairing after objects are damaged.
3 """
4 import logging
5 import time
6
7 from teuthology import misc as teuthology
8
9 log = logging.getLogger(__name__)
10
11
12 def choose_primary(manager, pool, num):
13 """
14 Return primary to test on.
15 """
16 log.info("Choosing primary")
17 return manager.get_pg_primary(pool, num)
18
19
20 def choose_replica(manager, pool, num):
21 """
22 Return replica to test on.
23 """
24 log.info("Choosing replica")
25 return manager.get_pg_replica(pool, num)
26

def trunc(manager, osd, pool, obj):
    """
    Truncate an object.
    """
    log.info("truncating object")
    return manager.osd_admin_socket(
        osd,
        ['truncobj', pool, obj, '1'])


def dataerr(manager, osd, pool, obj):
    """
    Cause an error in the object data.
    """
    log.info("injecting data err on object")
    return manager.osd_admin_socket(
        osd,
        ['injectdataerr', pool, obj])


def mdataerr(manager, osd, pool, obj):
    """
    Cause an error in the object metadata.
    """
    log.info("injecting mdata err on object")
    return manager.osd_admin_socket(
        osd,
        ['injectmdataerr', pool, obj])


def omaperr(manager, osd, pool, obj):
    """
    Cause an omap error.
    """
    log.info("injecting omap err on object")
    return manager.osd_admin_socket(osd, ['setomapval', pool, obj,
                                          'badkey', 'badval'])

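# The injectdataerr/injectmdataerr admin socket commands above rely on the
# OSDs running with 'filestore debug inject read err: true'; see the task()
# docstring below.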

def repair_test_1(manager, corrupter, chooser, scrub_type):
    """
    Creates an object in the pool, corrupts it, scrubs it, and verifies
    that the pool is inconsistent. It then repairs the pool, rescrubs it,
    and verifies that the pool is consistent.

    :param corrupter: error-generating function (truncate, data error, or
                      metadata error, for example).
    :param chooser: OSD chooser (primary or replica)
    :param scrub_type: regular scrub or deep-scrub
    """
    pool = "repair_pool_1"
    manager.wait_for_clean()
    with manager.pool(pool, 1):

        log.info("starting repair test type 1")
        victim_osd = chooser(manager, pool, 0)

        # create object
        log.info("doing put")
        manager.do_put(pool, 'repair_test_obj', '/etc/hosts')

        # corrupt object
        log.info("corrupting object")
        corrupter(manager, victim_osd, pool, 'repair_test_obj')

        # verify inconsistent
        log.info("scrubbing")
        manager.do_pg_scrub(pool, 0, scrub_type)

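        # with_pg_state verifies the given predicate against the PG's
        # current state string (e.g. 'active+clean+inconsistent').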
        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)

        # repair
        log.info("repairing")
        manager.do_pg_scrub(pool, 0, "repair")

        log.info("re-scrubbing")
        manager.do_pg_scrub(pool, 0, scrub_type)

        # verify consistent
        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' not in s)
        log.info("done")

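# For example, task() below exercises this as:
#   repair_test_1(manager, mdataerr, choose_primary, "scrub")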

def repair_test_2(ctx, manager, config, chooser):
    """
    First creates a set of objects and sets an omap value on some of
    them. It then corrupts an object, does both a scrub and a
    deep-scrub, and then corrupts more objects. After that, it repairs
    the pool and makes sure that the pool is consistent some time
    after a deep-scrub.

    :param chooser: primary or replica selection routine.
    """
    pool = "repair_pool_2"
    manager.wait_for_clean()
    with manager.pool(pool, 1):
        log.info("starting repair test type 2")
        victim_osd = chooser(manager, pool, 0)
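        # The rados CLI invocations below are run from the first
        # monitor's remote host.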
        first_mon = teuthology.get_first_mon(ctx, config)
        (mon,) = ctx.cluster.only(first_mon).remotes.keys()

        # create objects
        log.info("doing put and setomapval")
        manager.do_put(pool, 'file1', '/etc/hosts')
        manager.do_rados(mon, ['-p', pool, 'setomapval', 'file1',
                               'key', 'val'])
        manager.do_put(pool, 'file2', '/etc/hosts')
        manager.do_put(pool, 'file3', '/etc/hosts')
        manager.do_put(pool, 'file4', '/etc/hosts')
        manager.do_put(pool, 'file5', '/etc/hosts')
        manager.do_rados(mon, ['-p', pool, 'setomapval', 'file5',
                               'key', 'val'])
        manager.do_put(pool, 'file6', '/etc/hosts')

        # corrupt object
        log.info("corrupting object")
        omaperr(manager, victim_osd, pool, 'file1')

        # verify inconsistent
        log.info("scrubbing")
        manager.do_pg_scrub(pool, 0, 'deep-scrub')

        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)

        # Regression test for bug #4778: the PG should still
        # be inconsistent after a regular scrub
        manager.do_pg_scrub(pool, 0, 'scrub')

        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)

        # Additional corruptions, including 2 types for file1
        log.info("corrupting more objects")
        dataerr(manager, victim_osd, pool, 'file1')
        mdataerr(manager, victim_osd, pool, 'file2')
        trunc(manager, victim_osd, pool, 'file3')
        omaperr(manager, victim_osd, pool, 'file6')

        # verify still inconsistent
        log.info("scrubbing")
        manager.do_pg_scrub(pool, 0, 'deep-scrub')

        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)

        # repair
        log.info("repairing")
        manager.do_pg_scrub(pool, 0, "repair")

        # Let repair clear the inconsistent flag
        time.sleep(10)

        # verify consistent
        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' not in s)

        # Repair might one day determine the state of inconsistency
        # itself; for now, verify with a deep-scrub
        log.info("scrubbing")
        manager.do_pg_scrub(pool, 0, 'deep-scrub')

        # verify consistent
        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' not in s)

        log.info("done")


def hinfoerr(manager, victim, pool, obj):
    """
    Cause an error by removing the hinfo_key attribute from an
    erasure-coded object.
    """
    log.info("remove the hinfo_key")
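    # Unlike the admin-socket injectors above, this removes the attribute
    # directly in the object store via ceph-objectstore-tool.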
    manager.objectstore_tool(pool,
                             options='',
                             args='rm-attr hinfo_key',
                             object_name=obj,
                             osd=victim)


def repair_test_erasure_code(manager, corrupter, victim, scrub_type):
    """
    Creates an object in the pool, corrupts it, scrubs it, and verifies
    that the pool is inconsistent. It then repairs the pool, rescrubs
    it, and verifies that the pool is consistent.

    :param corrupter: error-generating function.
    :param victim: OSD to corrupt ('primary' in the current tests)
    :param scrub_type: regular scrub or deep-scrub
    """
    pool = "repair_pool_3"
    manager.wait_for_clean()
    with manager.pool(pool_name=pool, pg_num=1,
                      erasure_code_profile_name='default'):

        log.info("starting repair test for erasure code")

        # create object
        log.info("doing put")
        manager.do_put(pool, 'repair_test_obj', '/etc/hosts')

        # corrupt object
        log.info("corrupting object")
        corrupter(manager, victim, pool, 'repair_test_obj')

        # verify inconsistent
        log.info("scrubbing")
        manager.do_pg_scrub(pool, 0, scrub_type)

        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)

        # repair
        log.info("repairing")
        manager.do_pg_scrub(pool, 0, "repair")

        log.info("re-scrubbing")
        manager.do_pg_scrub(pool, 0, scrub_type)

        # verify consistent
        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' not in s)
        log.info("done")


def task(ctx, config):
    """
    Test [deep] repair in several situations:
    Repair [Truncate, Data EIO, MData EIO] on [Primary|Replica]

    The config should be as follows:

    Must include the log-whitelist below
    Must enable filestore_debug_inject_read_err config

    example:

    tasks:
    - chef:
    - install:
    - ceph:
        log-whitelist:
          - 'candidate had a stat error'
          - 'candidate had a read error'
          - 'deep-scrub 0 missing, 1 inconsistent objects'
          - 'deep-scrub 0 missing, 4 inconsistent objects'
          - 'deep-scrub [0-9]+ errors'
          - '!= omap_digest'
          - '!= data_digest'
          - 'repair 0 missing, 1 inconsistent objects'
          - 'repair 0 missing, 4 inconsistent objects'
          - 'repair [0-9]+ errors, [0-9]+ fixed'
          - 'scrub 0 missing, 1 inconsistent objects'
          - 'scrub [0-9]+ errors'
          - 'size 1 != size'
          - 'attr name mismatch'
          - 'Regular scrub request, deep-scrub details will be lost'
          - 'candidate size [0-9]+ info size [0-9]+ mismatch'
        conf:
          osd:
            filestore debug inject read err: true
    - repair_test:

    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'repair_test task only accepts a dict for config'

    manager = ctx.managers['ceph']
    manager.wait_for_all_osds_up()

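    # Disable scheduled scrubs so that only the scrubs the tests issue
    # explicitly (via do_pg_scrub) run.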
    manager.raw_cluster_cmd('osd', 'set', 'noscrub')
    manager.raw_cluster_cmd('osd', 'set', 'nodeep-scrub')

    repair_test_1(manager, mdataerr, choose_primary, "scrub")
    repair_test_1(manager, mdataerr, choose_replica, "scrub")
    repair_test_1(manager, dataerr, choose_primary, "deep-scrub")
    repair_test_1(manager, dataerr, choose_replica, "deep-scrub")
    repair_test_1(manager, trunc, choose_primary, "scrub")
    repair_test_1(manager, trunc, choose_replica, "scrub")
    repair_test_2(ctx, manager, config, choose_primary)
    repair_test_2(ctx, manager, config, choose_replica)

    repair_test_erasure_code(manager, hinfoerr, 'primary', "deep-scrub")

    manager.raw_cluster_cmd('osd', 'unset', 'noscrub')
    manager.raw_cluster_cmd('osd', 'unset', 'nodeep-scrub')