5 # Test the lost object logic
# Load test_common.sh from the directory that contains this script.
# "$0" is quoted so that a script path containing whitespace still resolves.
source "$(dirname "$0")/test_common.sh"
# Export the first positional argument as CEPH_NUM_OSD so that child
# processes (vstart.sh below) can see it.
CEPH_NUM_OSD=$1
export CEPH_NUM_OSD

# set recovery start to a really long time to ensure that we don't start
# recovery; the setting itself comes from the caller via $vstart_config.
if ! ./vstart.sh -d -n -o "$vstart_config"; then
  die "vstart failed"
fi
# for existing pools set size not greater than number of OSDs,
# so recovery from degraded pgs is possible
27 for pool
in `./ceph osd pool ls`; do
28 local size
=`./ceph osd pool get ${pool} size | awk '{print $2}'`
29 if [ "${size}" -gt "${CEPH_NUM_OSD}" ]; then
30 .
/ceph osd pool
set ${pool} size
${CEPH_NUM_OSD} --yes-i-really-mean-it
34 if [ ${changed} -eq 1 ]; then
# XXX: When a pool has degraded pgs due to size greater than number
# of OSDs, after decreasing the size the recovery could still get stuck
# and require an additional kick.
# Poll `./ceph health` for HEALTH_OK (poll_cmd is defined elsewhere; the
# trailing 1 and 30 are presumably its interval and timeout -- confirm).
poll_cmd "./ceph health" HEALTH_OK 1 30

# Write lots and lots of objects
write_objects 1 1 200 4000 $TEST_POOL

# Continue writing a lot of objects
write_objects 2 2 200 4000 $TEST_POOL
# At this point we have peered, but *NOT* recovered.
# Objects should be lost.
# Note: a poll_cmd status of 1 is what this script treats as success.
poll_cmd "./ceph pg debug degraded_pgs_exist" TRUE 3 120
if [ $? -ne 1 ]; then
  die "Failed to see degraded PGs."
fi

poll_cmd "./ceph pg debug unfound_objects_exist" TRUE 3 120
if [ $? -ne 1 ]; then
  die "Failed to see unfound objects."
fi
echo "Got unfound objects."
# Turn on recovery and wait for it to complete: first the unfound objects
# must disappear, then the degraded PGs.
# Note: a poll_cmd status of 1 is what this script treats as success.
poll_cmd "./ceph pg debug unfound_objects_exist" FALSE 3 120
if [ $? -ne 1 ]; then
  die "Failed to recover unfound objects."
fi

poll_cmd "./ceph pg debug degraded_pgs_exist" FALSE 3 120
if [ $? -ne 1 ]; then
  die "Recovery never finished."
fi
# Run setup with 2 as its first argument and a config line that delays the
# start of recovery.
setup 2 'osd recovery delay start = 10000'

# Default action later handed to `mark_unfound_lost`.
local lost_action
lost_action=delete
93 if is_set revert_lost
$flags; then
# Write lots and lots of objects
write_objects 1 1 20 8000 $TEST_POOL

# Continue writing a lot of objects
write_objects 2 2 20 8000 $TEST_POOL

# At this point we have peered, but *NOT* recovered.
# Objects should be lost.

# Since recovery can't proceed, stuff should be unfound.
# Note: a poll_cmd status of 1 is what this script treats as success.
poll_cmd "./ceph pg debug unfound_objects_exist" TRUE 3 120
if [ $? -ne 1 ]; then
  die "Failed to see unfound objects."
fi
# Collect the ids of the PGs that `ceph health detail` reports as having
# unfound objects: lines whose first field is "pg" and that end in
# "<digit> unfound"; the PG id is the second field.
# The first field has to be compared with `==`: a single `=` is an awk
# assignment, which overwrites $1 and silently drops the "pg" filter.
pgs_unfound=$(./ceph health detail | awk '$1 == "pg" && /[0-9] unfound$/ {print $2}')

# The rest of the test needs at least one such PG to act on.
[ -n "$pgs_unfound" ] || die "no pg with unfound objects"
# At this stage `mark_unfound_lost revert` is expected to be refused for
# every PG that has unfound objects; a success is a test failure.
for pg in $pgs_unfound; do
  ./ceph pg $pg mark_unfound_lost revert \
    && die "mark_unfound_lost unexpectedly succeeded for pg $pg"
done
130 if ! is_set mark_osd_lost
$flags && ! is_set rm_osd
$flags; then
134 if is_set try_to_fetch_unfound
$flags; then
135 # Ask for an object while it's still unfound, and
136 # verify we get woken to an error when it's declared lost.
137 echo "trying to get one of the unfound objects"
139 .
/rados
-c .
/ceph.conf
-p $TEST_POOL get obj02
$TEMPDIR/obj02
&&\
140 die
"expected radostool error"
# When the mark_osd_lost flag is among $flags, declare osd 0 lost.
if is_set mark_osd_lost $flags; then
  ./ceph osd lost 0 --yes-i-really-mean-it
fi
148 if is_set rm_osd
$flags; then
152 if ! is_set auto_mark_unfound_lost
$flags; then
153 for pg
in $pgs_unfound; do
154 .
/ceph pg
$pg mark_unfound_lost
${lost_action} ||
155 die
"mark_unfound_lost failed for pg $pg"
# Unfound objects go away and are turned into lost objects.
# Note: a poll_cmd status of 1 is what this script treats as success.
poll_cmd "./ceph pg debug unfound_objects_exist" FALSE 3 120
if [ $? -ne 1 ]; then
  die "Unfound objects didn't go away."
fi
# Every PG must now answer a `mark_unfound_lost` request with
# "pg has no unfound objects"; anything else means unfound objects remain.
# The PG list comes from the in-tree ./ceph wrapper, like every other call in
# this script, rather than from whatever `ceph` happens to be on PATH.
for pg in $(./ceph pg ls | awk '/^[0-9]/ {print $1}'); do
  ./ceph pg "$pg" mark_unfound_lost revert 2>&1 \
    | grep 'pg has no unfound objects' \
    || die "pg $pg has unfound objects"
done
# Reading from a lost object gives back an error code.
# TODO: check error code
#
# The read is used directly as the `if` condition so that its exit status is
# the one being tested; going through `$?` after another `[ ... ]` would test
# that bracket's status instead. "$lost_action" is expanded (the bare word
# `lost_action` never equals "delete" or "revert").
#   - delete: the object is gone, so the read must fail.
#   - revert: the object was rolled back, so the read must succeed.
if ./rados -c ./ceph.conf -p "$TEST_POOL" get obj01 "$TEMPDIR/obj01"; then
  if [ "$lost_action" = delete ]; then
    die "expected radostool error"
  fi
elif [ "$lost_action" = revert ]; then
  die "unexpected radostool error"
fi
180 if is_set try_to_fetch_unfound
$flags; then
181 echo "waiting for the try_to_fetch_unfound \
182 radostool instance to finish"
# Scenario calls: each one runs setup with the recovery-delay config and,
# where visible, lost1_impl with that scenario's flags.

# mark the OSD lost, revert unfound objects
setup 2 'osd recovery delay start = 10000'
lost1_impl mark_osd_lost revert_lost

# mark the OSD lost while a reader is waiting on an unfound object
setup 2 'osd recovery delay start = 10000'
lost1_impl mark_osd_lost try_to_fetch_unfound

setup 2 'osd recovery delay start = 10000'

# mark the OSD lost and remove it
setup 2 'osd recovery delay start = 10000'
lost1_impl mark_osd_lost rm_osd

# mark the OSD lost, rely on automatic marking of unfound objects
setup 2 'osd recovery delay start = 10000'
lost1_impl mark_osd_lost auto_mark_unfound_lost
212 all_osds_die_impl
() {
213 poll_cmd
"./ceph osd stat" '3 up, 3 in' 20 240
214 [ $?
-eq 1 ] || die
"didn't start 3 osds"
220 # wait for the MOSDPGStat timeout
221 poll_cmd
"./ceph osd stat" '0 up' 20 240
222 [ $?
-eq 1 ] || die
"all osds weren't marked as down"
# Run setup with 3 as its first argument and a two-line config snippet
# (report interval and report timeout) as its second.
setup 3 'osd mon report interval = 3
mon osd report timeout = 60'
# Test driver: run each scenario in order and abort on the first failure.
if ! recovery1; then
  die "test failed"
fi

if ! lost1; then
  die "test failed"
fi

# XXX: try_to_fetch_unfound test currently hangs on "waiting for the
# try_to_fetch_unfound radostool instance to finish"
#lost2 || die "test failed"

if ! lost3; then
  die "test failed"
fi

if ! lost4; then
  die "test failed"
fi

# XXX: automatically marking lost is not implemented
#lost5 || die "test failed"

if ! all_osds_die; then
  die "test failed"
fi