# Copyright (C) 2017 Red Hat <contact@redhat.com>
#
# Author: Kefu Chai <kchai@redhat.com>
# Author: David Zafman <dzafman@redhat.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Library Public License for more details.
#
# Load the shared standalone-test helpers.  Commands used below that are not
# defined in this script (setup, run_mon, run_osd, wait_for_clean, ...) are
# expected to come from this file.
source "$CEPH_ROOT/qa/standalone/ceph-helpers.sh"
# Per-script driver: for every selected test function, bring up a fresh
# mon + mgr, create pool "foo", run the test, then tear everything down.
#
# NOTE(review): the function header and the "local dir=$1; shift" prologue are
# inferred from "$func $dir" and from the "${@:-...}" default below; confirm
# the name "run" is what the helpers' main() invokes.
#
# Globals:
#   CEPH_MON   exported; monitor address reserved for this script
#   CEPH_ARGS  exported and appended to; arguments shared by all ceph commands
# Arguments:
#   $1   test directory
#   $2+  optional names of test functions to run; when omitted, every
#        currently defined function named TEST_<lowercase/digits/_> is run
# Returns:
#   0 if every step succeeded, 1 as soon as any step fails.
function run() {
    local dir=$1
    shift

    export CEPH_MON="127.0.0.1:7140" # git grep '\<7140\>' : there must be only one
    export CEPH_ARGS
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
    CEPH_ARGS+="--mon-host=$CEPH_MON "

    # Default test list: scrape function names out of the shell's own
    # "set" listing ("TEST_xyz () ...").
    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
    local func
    # $funcs is intentionally unquoted: it is a whitespace-separated list.
    for func in $funcs; do
        setup "$dir" || return 1
        run_mon "$dir" a || return 1
        run_mgr "$dir" x || return 1
        ceph osd pool create foo 8 || return 1

        $func "$dir" || return 1
        teardown "$dir" || return 1
    done
}
# Start a number of OSDs and wait until the cluster is clean.
#
# Globals:
#   dir  read; not a parameter -- taken from the caller's scope, since every
#        call site passes only the count and the optional type.
# Arguments:
#   $1  number of OSDs to start; ids run from 0 to count-1
#   $2  optional suffix selecting the launcher, e.g. "_filestore" makes this
#       call run_osd_filestore instead of run_osd
# Returns:
#   0 on success, 1 if an OSD fails to start or the wait fails.
function setup_osds() {
    local count=$1
    shift
    local type=$1
    local id

    for ((id = 0; id < count; id++)); do
        "run_osd${type}" "$dir" "$id" || return 1
    done
    wait_for_clean || return 1
}
# Print one field of a placement group's entry in "ceph pg dump pgs".
#
# NOTE(review): the assignments of pgid and sname are inferred from the
# call sites ("get_state 2.0", whose output is grepped for state names);
# confirm "state" is the intended default field.
#
# Arguments:
#   $1  pgid to look up, e.g. 2.0
#   $2  optional name of the field to print (default: state)
# Outputs:
#   the raw (unquoted) field value on stdout; nothing if the pgid is absent
function get_state() {
    local pgid=$1
    local sname=${2:-state}

    # Values are handed to jq as variables rather than spliced into the
    # filter text.  stderr of ceph is discarded to keep the output clean.
    ceph --format json pg dump pgs 2>/dev/null |
        jq -r --arg pgid "$pgid" --arg sname "$sname" \
            '.pg_stats | .[] | select(.pgid == $pgid) | .[$sname]'
}
# Generate a 4 KiB reference payload in $dir/ORIGINAL and store it as an
# object.  rados_get later compares what it reads back against this file.
#
# Arguments:
#   $1  test directory (receives the ORIGINAL file)
#   $2  pool name
#   $3  optional object name (default: SOMETHING)
# Returns:
#   0 on success, 1 if the put fails.
function rados_put() {
    local dir=$1
    local poolname=$2
    local objname=${3:-SOMETHING}
    local marker

    # Four markers, each right-aligned in a 1024-character field.
    for marker in AAA BBB CCCC DDDD; do
        printf "%*s" 1024 "$marker"
    done > "$dir/ORIGINAL"

    rados --pool "$poolname" put "$objname" "$dir/ORIGINAL" || return 1
}
# Read an object into $dir/COPY and check the outcome against an expectation.
#
# NOTE(review): the default for $4 and the exit-status handling of the
# "fail"/"hang" branches are inferred; the 124 check mirrors the
# "timeout 5 rados ... ; test $? = 124" pattern used by the tests in this file.
#
# Arguments:
#   $1  test directory (holds ORIGINAL, receives COPY)
#   $2  pool name
#   $3  optional object name (default: SOMETHING)
#   $4  optional expectation:
#         fail  - the read must fail
#         hang  - the read must still be blocked after 5 seconds
#         other - the read must succeed and match $dir/ORIGINAL (default)
# Returns:
#   0 if the expectation was met, non-zero otherwise.
function rados_get() {
    local dir=$1
    local poolname=$2
    local objname=${3:-SOMETHING}
    local expect=${4:-ok}

    case "$expect" in
        fail)
            # Expect a failure to get object
            ! rados --pool "$poolname" get "$objname" "$dir/COPY"
            return
            ;;
        hang)
            # Expect hang trying to get object; timeout exits with 124 when
            # it had to kill the command.
            timeout 5 rados --pool "$poolname" get "$objname" "$dir/COPY"
            test "$?" = "124"
            return
            ;;
    esac

    # get an object, compare with $dir/ORIGINAL
    rados --pool "$poolname" get "$objname" "$dir/COPY" || return 1
    diff "$dir/ORIGINAL" "$dir/COPY" || return 1
    rm -f "$dir/COPY"
}
# Helper for rados_get_data: compare the repair counters with expectations.
#   $1 pgid   $2 expected num_objects_repaired   $3 expected num_shards_repaired
# NOTE(review): flush_pg_stats is expected to come from the sourced helpers;
# it is called so the cluster-wide counter is not read before the OSDs have
# reported -- confirm it is available.
function _rados_get_data_check_repairs() {
    local pgid=$1
    local want_objects=$2
    local want_shards=$3
    local got

    got=$(ceph pg "$pgid" query | jq '.info.stats.stat_sum.num_objects_repaired')
    test "$got" = "$want_objects" || return 1
    flush_pg_stats
    got=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
    test "$got" = "$want_shards" || return 1
}

# Inject read errors on replicas of one object and check that reads repair
# them, watching the repair counters after each round:
#   1. error on shard 0               -> objects repaired 1, shards repaired 1
#   2. errors on shards 0 and 1, primary told to pull from the bad peer
#   3. errors on shards 0 and 2       -> objects repaired 3, shards repaired 4
#   4. errors on shards 0, 1 and 2    -> read hangs, counters unchanged
#
# Arguments:
#   $1  injection kind; inject_<kind> is called (callers here pass "eio")
#   $2  test directory
# Returns:
#   0 if every step behaved as expected, 1 otherwise.
function rados_get_data() {
    local inject=$1
    shift
    local dir=$1

    local poolname=pool-rep
    local objname=obj-$inject-$$
    local pgid
    pgid=$(get_pg "$poolname" "$objname") || return 1

    rados_put "$dir" "$poolname" "$objname" || return 1
    "inject_$inject" rep data "$poolname" "$objname" "$dir" 0 || return 1
    rados_get "$dir" "$poolname" "$objname" || return 1

    # NOTE(review): waiting here (and before the second check) is inferred;
    # the counters are only meaningful once the repair has completed.
    wait_for_clean || return 1
    _rados_get_data_check_repairs "$pgid" 1 1 || return 1

    # Word splitting of the OSD list into array elements is intentional.
    local object_osds=($(get_osds "$poolname" "$objname"))
    local primary=${object_osds[0]}
    local bad_peer=${object_osds[1]}
    "inject_$inject" rep data "$poolname" "$objname" "$dir" 0 || return 1
    "inject_$inject" rep data "$poolname" "$objname" "$dir" 1 || return 1
    # Force primary to pull from the bad peer, so we can repair it too!
    set_config osd "$primary" osd_debug_feed_pullee "$bad_peer" || return 1
    rados_get "$dir" "$poolname" "$objname" || return 1

    # Wait until automatic repair of bad peer is done
    wait_for_clean || return 1

    "inject_$inject" rep data "$poolname" "$objname" "$dir" 0 || return 1
    "inject_$inject" rep data "$poolname" "$objname" "$dir" 2 || return 1
    rados_get "$dir" "$poolname" "$objname" || return 1

    wait_for_clean || return 1
    _rados_get_data_check_repairs "$pgid" 3 4 || return 1

    "inject_$inject" rep data "$poolname" "$objname" "$dir" 0 || return 1
    "inject_$inject" rep data "$poolname" "$objname" "$dir" 1 || return 1
    "inject_$inject" rep data "$poolname" "$objname" "$dir" 2 || return 1
    rados_get "$dir" "$poolname" "$objname" hang || return 1

    # After hang another repair couldn't happen, so count stays the same
    _rados_get_data_check_repairs "$pgid" 3 4 || return 1

    rm -f "$dir/ORIGINAL"
}
# Read-repair test on a replicated pool: start 4 OSDs, create pool-rep and
# run the rados_get_data scenario with "eio" injection.
#
# Arguments:
#   $1  test directory (the driver calls each test as "$func $dir")
# Returns:
#   1 if setup or the scenario fails; otherwise the status of delete_pool.
function TEST_rados_get_with_eio() {
    local dir=$1

    setup_osds 4 || return 1

    local poolname=pool-rep
    create_pool "$poolname" 1 1 || return 1
    wait_for_clean || return 1
    rados_get_data eio "$dir" || return 1

    delete_pool "$poolname"
}
# Test backfill with unfound object
#
# Outline: write one object, stop and mark out its last OSD, write $lastobj
# more objects, inject EIO on shards 0 and 1 of $testobj, bring the OSD back
# and wait for the PG to report backfill_unfound.  Then check the object is
# listed as unfound, that reading it hangs, mark it lost, and verify every
# other object is intact.
#
# NOTE(review): the values of lastobj and testobj and the settle/poll sleeps
# are not derivable from the rest of this function; confirm them against the
# upstream test before relying on them.
#
# Arguments:
#   $1  test directory
# Returns:
#   1 on the first failed step; otherwise the status of delete_pool.
function TEST_rep_backfill_unfound() {
    local dir=$1
    local objname=myobject
    local lastobj=300
    # Must be between 1 and $lastobj
    local testobj=obj250
    local i tmp state

    # Small pg log limits; presumably so the returning OSD cannot catch up
    # by log-based recovery -- TODO confirm.
    export CEPH_ARGS
    CEPH_ARGS+=' --osd_min_pg_log_entries=5 --osd_max_pg_log_entries=10'
    setup_osds 3 || return 1

    local poolname=test-pool
    create_pool "$poolname" 1 1 || return 1
    wait_for_clean || return 1

    rados_put "$dir" "$poolname" "$objname" || return 1

    # Word splitting of the OSD list into array elements is intentional.
    local -a initial_osds=($(get_osds "$poolname" "$objname"))
    local last_osd=${initial_osds[-1]}
    kill_daemons "$dir" TERM "osd.${last_osd}" 2>&2 < /dev/null || return 1
    ceph osd down "${last_osd}" || return 1
    ceph osd out "${last_osd}" || return 1

    dd if=/dev/urandom of="${dir}/ORIGINAL" bs=1024 count=4
    for ((i = 1; i <= lastobj; i++)); do
        rados --pool "$poolname" put "obj${i}" "$dir/ORIGINAL" || return 1
    done

    inject_eio rep data "$poolname" "$testobj" "$dir" 0 || return 1
    inject_eio rep data "$poolname" "$testobj" "$dir" 1 || return 1

    activate_osd "$dir" "${last_osd}" || return 1
    ceph osd in "${last_osd}" || return 1

    sleep 15

    # Poll (up to 100 times) until the PG state mentions backfill_unfound.
    for ((tmp = 0; tmp < 100; tmp++)); do
        state=$(get_state 2.0)
        if echo $state | grep backfill_unfound; then
            break
        fi
        sleep 1
    done

    ceph pg 2.0 list_unfound | grep -q "$testobj" || return 1

    # Command should hang because object is unfound
    timeout 5 rados -p "$poolname" get "$testobj" "$dir/CHECK"
    test $? = "124" || return 1

    ceph pg 2.0 mark_unfound_lost delete

    wait_for_clean || return 1

    for ((i = 1; i <= lastobj; i++)); do
        if [ "obj${i}" = "$testobj" ]; then
            # Doesn't exist anymore
            ! rados -p "$poolname" get "$testobj" "$dir/CHECK" || return 1
        else
            rados --pool "$poolname" get "obj${i}" "$dir/CHECK" || return 1
            diff -q "$dir/ORIGINAL" "$dir/CHECK" || return 1
        fi
    done

    rm -f "${dir}/ORIGINAL" "${dir}/CHECK"

    delete_pool "$poolname"
}
# Test recovery with unfound object
#
# Outline: write one object, stop and mark out its last OSD, write $lastobj
# more objects, inject EIO on shards 0 and 1 of $testobj, bring the OSD back
# and wait until the PG state no longer mentions "recovering".  Then check
# the object is listed as unfound, that reading it hangs, mark it lost, and
# verify every other object is intact.
#
# NOTE(review): the values of lastobj and testobj and the settle/poll sleeps
# are not derivable from the rest of this function; confirm them against the
# upstream test before relying on them.
#
# Arguments:
#   $1  test directory
# Returns:
#   1 on the first failed step; otherwise the status of delete_pool.
function TEST_rep_recovery_unfound() {
    local dir=$1
    local objname=myobject
    local lastobj=100
    # Must be between 1 and $lastobj
    local testobj=obj75
    local i tmp state

    setup_osds 3 || return 1

    local poolname=test-pool
    create_pool "$poolname" 1 1 || return 1
    wait_for_clean || return 1

    rados_put "$dir" "$poolname" "$objname" || return 1

    # Word splitting of the OSD list into array elements is intentional.
    local -a initial_osds=($(get_osds "$poolname" "$objname"))
    local last_osd=${initial_osds[-1]}
    kill_daemons "$dir" TERM "osd.${last_osd}" 2>&2 < /dev/null || return 1
    ceph osd down "${last_osd}" || return 1
    ceph osd out "${last_osd}" || return 1

    dd if=/dev/urandom of="${dir}/ORIGINAL" bs=1024 count=4
    for ((i = 1; i <= lastobj; i++)); do
        rados --pool "$poolname" put "obj${i}" "$dir/ORIGINAL" || return 1
    done

    inject_eio rep data "$poolname" "$testobj" "$dir" 0 || return 1
    inject_eio rep data "$poolname" "$testobj" "$dir" 1 || return 1

    activate_osd "$dir" "${last_osd}" || return 1
    ceph osd in "${last_osd}" || return 1

    # Give recovery time to start: the loop below exits as soon as the state
    # does not contain "recovering", which is also true before it begins.
    sleep 15

    # Poll (up to 100 times) until the PG is no longer recovering.
    for ((tmp = 0; tmp < 100; tmp++)); do
        state=$(get_state 2.0)
        if echo $state | grep -v recovering; then
            break
        fi
        sleep 1
    done

    ceph pg 2.0 list_unfound | grep -q "$testobj" || return 1

    # Command should hang because object is unfound
    timeout 5 rados -p "$poolname" get "$testobj" "$dir/CHECK"
    test $? = "124" || return 1

    ceph pg 2.0 mark_unfound_lost delete

    wait_for_clean || return 1

    for ((i = 1; i <= lastobj; i++)); do
        if [ "obj${i}" = "$testobj" ]; then
            # Doesn't exist anymore
            ! rados -p "$poolname" get "$testobj" "$dir/CHECK" || return 1
        else
            rados --pool "$poolname" get "obj${i}" "$dir/CHECK" || return 1
            diff -q "$dir/ORIGINAL" "$dir/CHECK" || return 1
        fi
    done

    rm -f "${dir}/ORIGINAL" "${dir}/CHECK"

    delete_pool "$poolname"
}
# This is a filestore only test because it requires data digest in object info
#
# Outline: with a size-2 pool, overwrite the object's bytes on both the
# primary and the other replica with random data, start a read in the
# background, and expect the PG to reach recovery_unfound.  Then restore the
# good bytes on the non-primary replica and check that the pending read
# completes with the original data.
#
# NOTE(review): the settle sleep, flush_pg_stats and the wait for the
# background read are inferred from the trailing "&" and the later cmp of
# $dir/tmp; confirm against the upstream test.
#
# Arguments:
#   $1  test directory
# Returns:
#   0 if the read returned the original data, 1 on any failed step.
function TEST_rep_read_unfound() {
    local dir=$1
    local objname=myobject

    setup_osds 3 _filestore || return 1

    # Drop pool "foo" so the test pool's PG is the only entry in pg_stats;
    # the state check below looks at index 0.
    ceph osd pool delete foo foo --yes-i-really-really-mean-it || return 1
    local poolname=test-pool
    create_pool "$poolname" 1 1 || return 1
    ceph osd pool set "$poolname" size 2
    wait_for_clean || return 1

    dd if=/dev/urandom bs=8k count=1 of="$dir/ORIGINAL"
    rados -p "$poolname" put "$objname" "$dir/ORIGINAL" || return 1

    local primary other
    primary=$(get_primary "$poolname" "$objname")
    other=$(get_not_primary "$poolname" "$objname")

    dd if=/dev/urandom bs=8k count=1 of="$dir/CORRUPT"
    objectstore_tool "$dir" "$primary" "$objname" set-bytes "$dir/CORRUPT" || return 1
    objectstore_tool "$dir" "$other" "$objname" set-bytes "$dir/CORRUPT" || return 1

    # The read is expected to block until a good copy exists again.
    timeout 30 rados -p "$poolname" get "$objname" "$dir/tmp" &
    local reader=$!

    sleep 5

    flush_pg_stats
    ceph --format=json pg dump pgs | jq '.'

    if ! ceph --format=json pg dump pgs | jq '.pg_stats | .[0].state' | grep -q recovery_unfound; then
        echo "Failure to get to recovery_unfound state"
        return 1
    fi

    objectstore_tool "$dir" "$other" "$objname" set-bytes "$dir/ORIGINAL" || return 1

    # Let the background read finish; its status is not checked here because
    # the comparison below is the actual verdict.
    wait "$reader"

    if ! cmp "$dir/ORIGINAL" "$dir/tmp"; then
        echo "Bad data after primary repair"
        return 1
    fi
}
# Start the test run, forwarding any test names given on the command line.
# main is not defined in this script; presumably it comes from the sourced
# helpers.
main osd-rep-recov-eio.sh "$@"
# Local Variables:
# compile-command: "cd ../../../build ; make -j4 && ../qa/run-standalone.sh osd-rep-recov-eio.sh"
# End: