#!/usr/bin/env bash
#
# Copyright (C) 2017 Red Hat <contact@redhat.com>
#
# Author: Kefu Chai <kchai@redhat.com>
# Author: David Zafman <dzafman@redhat.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Library Public License for more details.
#
# Pull in the standalone-test helper library (setup, run_mon, run_osd,
# create_pool, inject_eio, wait_for_clean, ...).  CEPH_ROOT is exported
# by qa/run-standalone.sh.
source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
# Number of repaired reads that triggers the OSD_TOO_MANY_REPAIRS mon
# warning; passed to run_mon below so the test does not depend on the
# compiled-in default.  TEST_rados_repair_warning reads this too.
warnings=3

# Test driver: bring up a mon+mgr, then execute each TEST_* function
# (or only those named on the command line) inside a fresh test dir.
# $1 - test directory; remaining args - optional TEST_* names
function run() {
    local dir=$1
    shift

    export CEPH_MON="127.0.0.1:7140" # git grep '\<7140\>' : there must be only one
    export CEPH_ARGS
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
    CEPH_ARGS+="--mon-host=$CEPH_MON "

    # Default to every function named TEST_* in the current shell.
    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
    for func in $funcs ; do
        setup $dir || return 1
        # set warning amount in case default changes
        run_mon $dir a --mon_osd_warn_num_repaired=$warnings || return 1
        run_mgr $dir x || return 1
        ceph osd pool create foo 8 || return 1

        $func $dir || return 1
        teardown $dir || return 1
    done
}
# Start $1 OSDs and wait for the cluster to go clean.
# $1 - number of OSDs
# $2 - optional run_osd suffix (e.g. "_filestore" => run_osd_filestore)
# NOTE(review): relies on the caller's dynamically-scoped $dir local.
function setup_osds() {
    local count=$1
    shift
    local type=$1

    for id in $(seq 0 $(expr $count - 1)) ; do
        run_osd${type} $dir $id || return 1
    done

    wait_for_clean || return 1
}
# Print the current state string of the given PG (empty if not found).
# $1 - pgid, e.g. "2.0"
function get_state() {
    local pgid=$1
    local sname=state
    ceph --format json pg dump pgs 2>/dev/null | \
        jq -r ".pg_stats | .[] | select(.pgid==\"$pgid\") | .$sname"
}
# Create a 4 KiB reference payload in $dir/ORIGINAL and store it as an
# object in the pool.
# $1 - test directory
# $2 - pool name
# $3 - object name (default SOMETHING)
function rados_put() {
    local dir=$1
    local poolname=$2
    local objname=${3:-SOMETHING}

    # Four 1024-byte space-padded chunks make up the reference payload.
    for marker in AAA BBB CCCC DDDD ; do
        printf "%*s" 1024 $marker
    done > $dir/ORIGINAL
    #
    # get and put an object, compare they are equal
    #
    rados --pool $poolname put $objname $dir/ORIGINAL || return 1
}
# Read an object back and check the outcome.
# $1 - test directory
# $2 - pool name
# $3 - object name (default SOMETHING)
# $4 - expectation: "ok" (default, compare against $dir/ORIGINAL),
#      "fail" (the get must error out), or "hang" (the get must time out)
function rados_get() {
    local dir=$1
    local poolname=$2
    local objname=${3:-SOMETHING}
    local expect=${4:-ok}

    #
    # Expect a failure to get object
    #
    if [ $expect = "fail" ];
    then
        ! rados --pool $poolname get $objname $dir/COPY
        return
    fi
    #
    # Expect hang trying to get object
    #
    if [ $expect = "hang" ];
    then
        timeout 5 rados --pool $poolname get $objname $dir/COPY
        # timeout(1) exits 124 when the command had to be killed.
        test "$?" = "124"
        return
    fi
    #
    # get an object, compare with $dir/ORIGINAL
    #
    rados --pool $poolname get $objname $dir/COPY || return 1
    diff $dir/ORIGINAL $dir/COPY || return 1
    rm -f $dir/COPY
}
# Exercise read-error repair on a replicated pool and verify that the
# repair counters advance: num_objects_repaired in the PG stats and
# num_shards_repaired in the aggregated OSD stats.
# $1 - error type to inject ("eio" => inject_eio helper)
# $2 - test directory
function rados_get_data() {
    local inject=$1
    shift
    local dir=$1

    local poolname=pool-rep
    local objname=obj-$inject-$$
    local pgid=$(get_pg $poolname $objname)

    # One bad shard: the read repairs it transparently.
    rados_put $dir $poolname $objname || return 1
    inject_$inject rep data $poolname $objname $dir 0 || return 1
    rados_get $dir $poolname $objname || return 1

    COUNT=$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_objects_repaired')
    test "$COUNT" = "1" || return 1
    COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
    test "$COUNT" = "1" || return 1

    # Two bad shards: primary repairs itself from the bad peer, which
    # is then repaired automatically as well.
    local object_osds=($(get_osds $poolname $objname))
    local primary=${object_osds[0]}
    local bad_peer=${object_osds[1]}
    inject_$inject rep data $poolname $objname $dir 0 || return 1
    inject_$inject rep data $poolname $objname $dir 1 || return 1
    # Force primary to pull from the bad peer, so we can repair it too!
    set_config osd $primary osd_debug_feed_pullee $bad_peer || return 1
    rados_get $dir $poolname $objname || return 1

    # Wait until automatic repair of bad peer is done
    wait_for_clean || return 1

    inject_$inject rep data $poolname $objname $dir 0 || return 1
    inject_$inject rep data $poolname $objname $dir 2 || return 1
    rados_get $dir $poolname $objname || return 1

    # objects: 1 (first pass) + 1 (second) + 1 (third) = 3
    COUNT=$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_objects_repaired')
    test "$COUNT" = "3" || return 1
    # shards: 1 + 2 (primary + pullee) + 1 = 4
    COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
    test "$COUNT" = "4" || return 1

    # All replicas bad: the read cannot be repaired and must hang.
    inject_$inject rep data $poolname $objname $dir 0 || return 1
    inject_$inject rep data $poolname $objname $dir 1 || return 1
    inject_$inject rep data $poolname $objname $dir 2 || return 1
    rados_get $dir $poolname $objname hang || return 1

    # After hang another repair couldn't happen, so count stays the same
    COUNT=$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_objects_repaired')
    test "$COUNT" = "3" || return 1
    COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
    test "$COUNT" = "4" || return 1
}
# Verify EIO read errors are repaired on a replicated pool.
# $1 - test directory
function TEST_rados_get_with_eio() {
    local dir=$1

    setup_osds 4 || return 1

    local poolname=pool-rep
    create_pool $poolname 1 1 || return 1
    wait_for_clean || return 1
    rados_get_data eio $dir || return 1

    delete_pool $poolname
}
# Verify the OSD_TOO_MANY_REPAIRS health warning: it fires once an OSD
# exceeds $warnings repaired reads, can be muted, and the mute is lifted
# when a second OSD starts reporting repairs.
# $1 - test directory
function TEST_rados_repair_warning() {
    local dir=$1
    # One more repaired read than the warning threshold.
    local OBJS=$(expr $warnings + 1)

    setup_osds 4 || return 1

    local poolname=pool-rep
    create_pool $poolname 1 1 || return 1
    wait_for_clean || return 1

    local poolname=pool-rep
    local objbase=obj-warn
    local inject=eio

    # Repair $OBJS objects on the primary to cross the threshold.
    for i in $(seq 1 $OBJS)
    do
        rados_put $dir $poolname ${objbase}-$i || return 1
        inject_$inject rep data $poolname ${objbase}-$i $dir 0 || return 1
        rados_get $dir $poolname ${objbase}-$i || return 1
    done
    local pgid=$(get_pg $poolname ${objbase}-1)

    local object_osds=($(get_osds $poolname ${objbase}-1))
    local primary=${object_osds[0]}
    local bad_peer=${object_osds[1]}

    COUNT=$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_objects_repaired')
    test "$COUNT" = "$OBJS" || return 1
    COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
    test "$COUNT" = "$OBJS" || return 1

    ceph health | grep -q "Too many repaired reads on 1 OSDs" || return 1
    ceph health detail | grep -q "osd.$primary had $OBJS reads repaired" || return 1

    ceph health mute OSD_TOO_MANY_REPAIRS
    # NOTE(review): the $(! grep ...) form runs grep on the pipeline's
    # stdin inside a command substitution; the pipeline's status is the
    # negated grep, i.e. success when the warning is gone.
    ceph health | $(! grep -q "Too many repaired reads on 1 OSDs") || return 1

    # Now repair each object on a second OSD too.
    for i in $(seq 1 $OBJS)
    do
        inject_$inject rep data $poolname ${objbase}-$i $dir 0 || return 1
        inject_$inject rep data $poolname ${objbase}-$i $dir 1 || return 1
        # Force primary to pull from the bad peer, so we can repair it too!
        set_config osd $primary osd_debug_feed_pullee $bad_peer || return 1
        rados_get $dir $poolname ${objbase}-$i || return 1
    done

    COUNT=$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_objects_repaired')
    test "$COUNT" = "$(expr $OBJS \* 2)" || return 1
    COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
    test "$COUNT" = "$(expr $OBJS \* 3)" || return 1

    # Give mon a chance to notice additional OSD and unmute
    # The default tick time is 5 seconds
    CHECKTIME=10
    LOOPS=0
    while true
    do
        sleep 1
        if ceph health | grep -q "Too many repaired reads on 2 OSDs"
        then
            break
        fi
        LOOPS=$(expr $LOOPS + 1)
        if test "$LOOPS" = "$CHECKTIME"
        then
            echo "Too many repaired reads not seen after $CHECKTIME seconds"
            return 1
        fi
    done
    ceph health detail | grep -q "osd.$primary had $(expr $OBJS \* 2) reads repaired" || return 1
    ceph health detail | grep -q "osd.$bad_peer had $OBJS reads repaired" || return 1

    delete_pool $poolname
}
# Test backfill with unfound object
# Kill one OSD, write many objects, corrupt both surviving copies of one
# of them, then bring the OSD back: backfill must report the object
# unfound, reads of it must hang, and mark_unfound_lost must recover.
# $1 - test directory
function TEST_rep_backfill_unfound() {
    local dir=$1
    local objname=myobject
    # NOTE(review): the original values of lastobj/testobj were lost in
    # the mangled source; 300/obj250 match the upstream test — confirm.
    local lastobj=300
    # Must be between 1 and $lastobj
    local testobj=obj250

    # Short pg log so the restarted OSD must backfill rather than
    # log-recover.
    export CEPH_ARGS
    CEPH_ARGS+=' --osd_min_pg_log_entries=5 --osd_max_pg_log_entries=10'
    setup_osds 3 || return 1

    local poolname=test-pool
    create_pool $poolname 1 1 || return 1
    wait_for_clean || return 1

    rados_put $dir $poolname $objname || return 1

    local -a initial_osds=($(get_osds $poolname $objname))
    local last_osd=${initial_osds[-1]}
    # NOTE(review): "2>&2" is a no-op redirection; ">&2" was presumably
    # intended — kept as-is to preserve behavior.
    kill_daemons $dir TERM osd.${last_osd} 2>&2 < /dev/null || return 1
    ceph osd down ${last_osd} || return 1
    ceph osd out ${last_osd} || return 1

    # Write enough new objects while the OSD is out to force backfill.
    dd if=/dev/urandom of=${dir}/ORIGINAL bs=1024 count=4
    for i in $(seq 1 $lastobj)
    do
        rados --pool $poolname put obj${i} $dir/ORIGINAL || return 1
    done

    # Corrupt both remaining copies of the test object.
    inject_eio rep data $poolname $testobj $dir 0 || return 1
    inject_eio rep data $poolname $testobj $dir 1 || return 1

    activate_osd $dir ${last_osd} || return 1
    ceph osd in ${last_osd} || return 1

    # Wait for the PG to report backfill_unfound.
    for tmp in $(seq 1 100); do
        state=$(get_state 2.0)
        echo $state | grep backfill_unfound
        if [ "$?" = "0" ]; then
            break
        fi
        sleep 1
    done

    ceph pg 2.0 list_unfound | grep -q $testobj || return 1

    # Command should hang because object is unfound
    timeout 5 rados -p $poolname get $testobj $dir/CHECK
    test $? = "124" || return 1

    ceph pg 2.0 mark_unfound_lost delete

    wait_for_clean || return 1

    # Every object except the deleted one must still read back intact.
    for i in $(seq 1 $lastobj)
    do
        if [ obj${i} = "$testobj" ]; then
            # Doesn't exist anymore
            ! rados -p $poolname get $testobj $dir/CHECK || return 1
        else
            rados --pool $poolname get obj${i} $dir/CHECK || return 1
            diff -q $dir/ORIGINAL $dir/CHECK || return 1
        fi
    done

    rm -f ${dir}/ORIGINAL ${dir}/CHECK

    delete_pool $poolname
}
# Test recovery with unfound object
# Same scenario as the backfill variant, but with the default pg log
# length so the returning OSD goes through log-based recovery instead.
# $1 - test directory
function TEST_rep_recovery_unfound() {
    local dir=$1
    local objname=myobject
    # NOTE(review): the original values of lastobj/testobj were lost in
    # the mangled source; 100/obj75 match the upstream test — confirm.
    local lastobj=100
    # Must be between 1 and $lastobj
    local testobj=obj75

    setup_osds 3 || return 1

    local poolname=test-pool
    create_pool $poolname 1 1 || return 1
    wait_for_clean || return 1

    rados_put $dir $poolname $objname || return 1

    local -a initial_osds=($(get_osds $poolname $objname))
    local last_osd=${initial_osds[-1]}
    # NOTE(review): "2>&2" is a no-op redirection; ">&2" was presumably
    # intended — kept as-is to preserve behavior.
    kill_daemons $dir TERM osd.${last_osd} 2>&2 < /dev/null || return 1
    ceph osd down ${last_osd} || return 1
    ceph osd out ${last_osd} || return 1

    # Write new objects while the OSD is out.
    dd if=/dev/urandom of=${dir}/ORIGINAL bs=1024 count=4
    for i in $(seq 1 $lastobj)
    do
        rados --pool $poolname put obj${i} $dir/ORIGINAL || return 1
    done

    # Corrupt both remaining copies of the test object.
    inject_eio rep data $poolname $testobj $dir 0 || return 1
    inject_eio rep data $poolname $testobj $dir 1 || return 1

    activate_osd $dir ${last_osd} || return 1
    ceph osd in ${last_osd} || return 1

    # Wait until the PG is no longer recovering (the unfound object
    # stalls recovery).
    for tmp in $(seq 1 100); do
        state=$(get_state 2.0)
        echo $state | grep -v recovering
        if [ "$?" = "0" ]; then
            break
        fi
        sleep 1
    done

    ceph pg 2.0 list_unfound | grep -q $testobj || return 1

    # Command should hang because object is unfound
    timeout 5 rados -p $poolname get $testobj $dir/CHECK
    test $? = "124" || return 1

    ceph pg 2.0 mark_unfound_lost delete

    wait_for_clean || return 1

    # Every object except the deleted one must still read back intact.
    for i in $(seq 1 $lastobj)
    do
        if [ obj${i} = "$testobj" ]; then
            # Doesn't exist anymore
            ! rados -p $poolname get $testobj $dir/CHECK || return 1
        else
            rados --pool $poolname get obj${i} $dir/CHECK || return 1
            diff -q $dir/ORIGINAL $dir/CHECK || return 1
        fi
    done

    rm -f ${dir}/ORIGINAL ${dir}/CHECK

    delete_pool $poolname
}
# This is a filestore only test because it requires data digest in object info
# Corrupt both replicas of an object so a client read drives the PG into
# recovery_unfound, then restore one replica and verify the blocked read
# completes with the correct data.
# $1 - test directory
function TEST_rep_read_unfound() {
    local dir=$1
    local objname=myobject

    setup_osds 3 _filestore || return 1

    ceph osd pool delete foo foo --yes-i-really-really-mean-it || return 1
    local poolname=test-pool
    create_pool $poolname 1 1 || return 1
    ceph osd pool set $poolname size 2
    wait_for_clean || return 1

    dd if=/dev/urandom bs=8k count=1 of=$dir/ORIGINAL
    rados -p $poolname put $objname $dir/ORIGINAL

    local primary=$(get_primary $poolname $objname)
    local other=$(get_not_primary $poolname $objname)

    # Corrupt both replicas on disk; the data digest check will reject
    # them.
    dd if=/dev/urandom bs=8k count=1 of=$dir/CORRUPT
    objectstore_tool $dir $primary $objname set-bytes $dir/CORRUPT || return 1
    objectstore_tool $dir $other $objname set-bytes $dir/CORRUPT || return 1

    # The read blocks: no good replica exists yet.
    timeout 30 rados -p $poolname get $objname $dir/tmp &

    # Give the read time to trigger recovery and get stuck.
    sleep 5

    ceph --format=json pg dump pgs | jq '.'

    if ! ceph --format=json pg dump pgs | jq '.pg_stats | .[0].state' | grep -q recovery_unfound
    then
        echo "Failure to get to recovery_unfound state"
        return 1
    fi

    # Restore good data on the non-primary so recovery (and the blocked
    # read) can complete.
    objectstore_tool $dir $other $objname set-bytes $dir/ORIGINAL || return 1

    # Reap the background rados get.
    wait

    if ! cmp $dir/ORIGINAL $dir/tmp
    then
        echo "Bad data after primary repair"
        return 1
    fi
}
# ceph-helpers.sh main(): creates the test dir and invokes run() above.
main osd-rep-recov-eio.sh "$@"
# Local Variables:
# compile-command: "cd ../../../build ; make -j4 && ../qa/run-standalone.sh osd-rep-recov-eio.sh"
# End: