"""
Test pool repairing after objects are damaged.
"""
import logging

from teuthology import misc as teuthology

# Module-level logger shared by every helper below.
log = logging.getLogger(__name__)
def choose_primary(manager, pool, num):
    """
    Return the primary OSD to test on.

    :param manager: ceph manager object used to query the cluster
    :param pool: pool name
    :param num: pg number within the pool
    """
    log.info("Choosing primary")
    return manager.get_pg_primary(pool, num)
def choose_replica(manager, pool, num):
    """
    Return a replica OSD to test on.

    :param manager: ceph manager object used to query the cluster
    :param pool: pool name
    :param num: pg number within the pool
    """
    log.info("Choosing replica")
    return manager.get_pg_replica(pool, num)
def trunc(manager, osd, pool, obj):
    """
    Truncate an object to 1 byte via the OSD admin socket.

    :param manager: ceph manager object
    :param osd: id of the OSD whose admin socket is used
    :param pool: pool name
    :param obj: object name
    """
    log.info("truncating object")
    # The admin socket command must target a specific OSD; without
    # the osd argument the `osd` parameter would be dead code.
    return manager.osd_admin_socket(
        osd,
        ['truncobj', pool, obj, '1'])
def dataerr(manager, osd, pool, obj):
    """
    Cause an error in the object's data on one OSD.

    :param manager: ceph manager object
    :param osd: id of the OSD whose admin socket is used
    :param pool: pool name
    :param obj: object name
    """
    log.info("injecting data err on object")
    # Targets the victim OSD explicitly, matching trunc/mdataerr.
    return manager.osd_admin_socket(
        osd,
        ['injectdataerr', pool, obj])
def mdataerr(manager, osd, pool, obj):
    """
    Cause an error in the object's metadata on one OSD.

    :param manager: ceph manager object
    :param osd: id of the OSD whose admin socket is used
    :param pool: pool name
    :param obj: object name
    """
    log.info("injecting mdata err on object")
    # Targets the victim OSD explicitly, matching trunc/dataerr.
    return manager.osd_admin_socket(
        osd,
        ['injectmdataerr', pool, obj])
def omaperr(manager, osd, pool, obj):
    """
    Cause an omap error by setting a bogus omap key/value on one OSD.

    :param manager: ceph manager object
    :param osd: id of the OSD whose admin socket is used
    :param pool: pool name
    :param obj: object name
    """
    log.info("injecting omap err on object")
    # 'badkey'/'badval' only exist on this OSD, so a deep-scrub sees an
    # omap digest mismatch between replicas.
    return manager.osd_admin_socket(osd, ['setomapval', pool, obj,
                                          'badkey', 'badval'])
def repair_test_1(manager, corrupter, chooser, scrub_type):
    """
    Creates an object in the pool, corrupts it,
    scrubs it, and verifies that the pool is inconsistent.  It then repairs
    the pool, rescrubs it, and verifies that the pool is consistent.

    :param corrupter: error generating function (truncate, data-error, or
                      meta-data error, for example).
    :param chooser: osd type chooser (primary or replica)
    :param scrub_type: regular scrub or deep-scrub
    """
    pool = "repair_pool_1"
    manager.wait_for_clean()
    with manager.pool(pool, 1):

        log.info("starting repair test type 1")
        victim_osd = chooser(manager, pool, 0)

        # create an object to damage
        manager.do_put(pool, 'repair_test_obj', '/etc/hosts')

        log.info("corrupting object")
        corrupter(manager, victim_osd, pool, 'repair_test_obj')

        # verify that scrubbing flags the pg inconsistent
        manager.do_pg_scrub(pool, 0, scrub_type)
        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)

        log.info("repairing")
        manager.do_pg_scrub(pool, 0, "repair")

        log.info("re-scrubbing")
        manager.do_pg_scrub(pool, 0, scrub_type)

        # verify the repair cleared the inconsistency
        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' not in s)
def repair_test_2(ctx, manager, config, chooser):
    """
    First creates a set of objects and
    sets the omap value.  It then corrupts an object, does both a scrub
    and a deep-scrub, and then corrupts more objects.  After that, it
    repairs the pool and makes sure that the pool is consistent some
    time after a deep-scrub.

    :param chooser: primary or replica selection routine.
    """
    import time

    pool = "repair_pool_2"
    manager.wait_for_clean()
    with manager.pool(pool, 1):
        log.info("starting repair test type 2")
        victim_osd = chooser(manager, pool, 0)
        first_mon = teuthology.get_first_mon(ctx, config)
        # NOTE(review): iterkeys() is Python 2 only; this file appears to
        # predate a py3 port -- confirm before modernizing.
        (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

        # create objects; file1 and file5 also get an omap entry
        log.info("doing put and setomapval")
        manager.do_put(pool, 'file1', '/etc/hosts')
        manager.do_rados(mon, ['-p', pool, 'setomapval', 'file1',
                               'key', 'val'])
        manager.do_put(pool, 'file2', '/etc/hosts')
        manager.do_put(pool, 'file3', '/etc/hosts')
        manager.do_put(pool, 'file4', '/etc/hosts')
        manager.do_put(pool, 'file5', '/etc/hosts')
        manager.do_rados(mon, ['-p', pool, 'setomapval', 'file5',
                               'key', 'val'])
        manager.do_put(pool, 'file6', '/etc/hosts')

        log.info("corrupting object")
        omaperr(manager, victim_osd, pool, 'file1')

        # verify inconsistent
        log.info("scrubbing")
        manager.do_pg_scrub(pool, 0, 'deep-scrub')
        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)

        # Regression test for bug #4778, should still
        # be inconsistent after scrub
        manager.do_pg_scrub(pool, 0, 'scrub')
        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)

        # Additional corruptions including 2 types for file1
        log.info("corrupting more objects")
        dataerr(manager, victim_osd, pool, 'file1')
        mdataerr(manager, victim_osd, pool, 'file2')
        trunc(manager, victim_osd, pool, 'file3')
        omaperr(manager, victim_osd, pool, 'file6')

        # see still inconsistent
        log.info("scrubbing")
        manager.do_pg_scrub(pool, 0, 'deep-scrub')
        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)

        log.info("repairing")
        manager.do_pg_scrub(pool, 0, "repair")

        # Let repair clear inconsistent flag
        time.sleep(10)

        # verify consistent
        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' not in s)

        # In the future repair might determine state of
        # inconsistency itself, verify with a deep-scrub
        log.info("scrubbing")
        manager.do_pg_scrub(pool, 0, 'deep-scrub')

        # verify still consistent
        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' not in s)
def hinfoerr(manager, victim, pool, obj):
    """
    Cause an error in the hinfo_key by removing the attribute with
    ceph-objectstore-tool on the victim OSD.

    :param manager: ceph manager object
    :param victim: OSD to damage ('primary' or an osd id)
    :param pool: pool name
    :param obj: object name
    """
    log.info("remove the hinfo_key")
    manager.objectstore_tool(pool,
                             options='',
                             args='rm-attr hinfo_key',
                             object_name=obj,
                             osd=victim)
def repair_test_erasure_code(manager, corrupter, victim, scrub_type):
    """
    Creates an object in an erasure-coded pool, corrupts it,
    scrubs it, and verifies that the pool is inconsistent.  It then repairs
    the pool, rescrubs it, and verifies that the pool is consistent.

    :param corrupter: error generating function.
    :param victim: OSD to damage ('primary' or an osd id)
    :param scrub_type: regular scrub or deep-scrub
    """
    pool = "repair_pool_3"
    manager.wait_for_clean()
    with manager.pool(pool_name=pool, pg_num=1,
                      erasure_code_profile_name='default'):

        log.info("starting repair test for erasure code")

        # create an object to damage
        log.info("doing put")
        manager.do_put(pool, 'repair_test_obj', '/etc/hosts')

        log.info("corrupting object")
        corrupter(manager, victim, pool, 'repair_test_obj')

        # verify inconsistent
        log.info("scrubbing")
        manager.do_pg_scrub(pool, 0, scrub_type)
        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)

        log.info("repairing")
        manager.do_pg_scrub(pool, 0, "repair")

        log.info("re-scrubbing")
        manager.do_pg_scrub(pool, 0, scrub_type)

        # verify the repair cleared the inconsistency
        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' not in s)
def task(ctx, config):
    """
    Test [deep] repair in several situations:
      Repair [Truncate, Data EIO, MData EIO] on [Primary|Replica]

    The config should be as follows:

      Must include the log-whitelist below
      Must enable filestore_debug_inject_read_err config

    example:

    tasks:
    - install:
    - ceph:
        log-whitelist:
          - 'candidate had a stat error'
          - 'candidate had a read error'
          - 'deep-scrub 0 missing, 1 inconsistent objects'
          - 'deep-scrub 0 missing, 4 inconsistent objects'
          - 'deep-scrub [0-9]+ errors'
          - 'repair 0 missing, 1 inconsistent objects'
          - 'repair 0 missing, 4 inconsistent objects'
          - 'repair [0-9]+ errors, [0-9]+ fixed'
          - 'scrub 0 missing, 1 inconsistent objects'
          - 'scrub [0-9]+ errors'
          - 'attr name mismatch'
          - 'Regular scrub request, losing deep-scrub details'
        conf:
          osd:
            filestore debug inject read err: true
    - repair_test:

    """
    # Tolerate a bare `- repair_test:` entry with no arguments.
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'repair_test task only accepts a dict for config'

    manager = ctx.managers['ceph']
    manager.wait_for_all_up()

    # Suspend background scrubbing so the explicit scrubs below are the
    # only ones that run during the tests.
    manager.raw_cluster_cmd('osd', 'set', 'noscrub')
    manager.raw_cluster_cmd('osd', 'set', 'nodeep-scrub')

    repair_test_1(manager, mdataerr, choose_primary, "scrub")
    repair_test_1(manager, mdataerr, choose_replica, "scrub")
    repair_test_1(manager, dataerr, choose_primary, "deep-scrub")
    repair_test_1(manager, dataerr, choose_replica, "deep-scrub")
    repair_test_1(manager, trunc, choose_primary, "scrub")
    repair_test_1(manager, trunc, choose_replica, "scrub")
    repair_test_2(ctx, manager, config, choose_primary)
    repair_test_2(ctx, manager, config, choose_replica)

    repair_test_erasure_code(manager, hinfoerr, 'primary', "deep-scrub")

    manager.raw_cluster_cmd('osd', 'unset', 'noscrub')
    manager.raw_cluster_cmd('osd', 'unset', 'nodeep-scrub')