#!/usr/bin/env bash
set -x

#
# Test the lost object logic
#

# Includes
source "$(dirname "$0")/test_common.sh"
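
# Helper functions come from test_common.sh. As used below:
#   write_objects <start_ver> <stop_ver> <num_objs> <obj_size> <pool>
#   poll_cmd <cmd> <grep-pattern> <sleep-secs> <timeout-secs>
#     (poll_cmd is expected to return 1, not 0, once the pattern is seen,
#      which is why success is checked with '[ $? -eq 1 ]' throughout)
#   stop_osd <id> / restart_osd <id>, start_recovery <num-osds>,
#   is_set <flag> <flag-list...>, die <msg>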

TEST_POOL=rbd

# Functions
setup() {
    export CEPH_NUM_OSD=$1
    vstart_config=$2

    # Start ceph
    ./stop.sh

    # set the recovery delay start to a really long time to ensure that we
    # don't start recovery
    ./vstart.sh -d -n -o "$vstart_config" || die "vstart failed"
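    # (vstart.sh: -d enables debug output, -n creates a brand-new cluster,
    # and -o appends the given option(s) to the generated ceph.conf)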

    # for existing pools, set size no greater than the number of OSDs,
    # so recovery from degraded PGs is possible
    local changed=0
    for pool in $(./ceph osd pool ls); do
        local size=$(./ceph osd pool get ${pool} size | awk '{print $2}')
        if [ "${size}" -gt "${CEPH_NUM_OSD}" ]; then
            ./ceph osd pool set ${pool} size ${CEPH_NUM_OSD} --yes-i-really-mean-it
            changed=1
        fi
    done
    if [ ${changed} -eq 1 ]; then
        # XXX: When a pool has degraded PGs because its size exceeds the
        # number of OSDs, recovery can still get stuck after decreasing
        # the size and requires an additional kick.
        ./ceph osd out 0
        ./ceph osd in 0
    fi

    poll_cmd "./ceph health" HEALTH_OK 1 30
}

recovery1_impl() {
    # Write lots and lots of objects
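    # (assuming the write_objects signature above: 200 objects of 4000
    # bytes each, single version, into $TEST_POOL)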
    write_objects 1 1 200 4000 $TEST_POOL

    # Take down osd1
    stop_osd 1

    # Continue writing a lot of objects
    write_objects 2 2 200 4000 $TEST_POOL

    # Bring up osd1
    restart_osd 1

    # Finish peering.
    sleep 15

    # Stop osd0.
    # At this point we have peered, but *NOT* recovered.
    # Objects should be lost.
    stop_osd 0

    poll_cmd "./ceph pg debug degraded_pgs_exist" TRUE 3 120
    [ $? -eq 1 ] || die "Failed to see degraded PGs."
    poll_cmd "./ceph pg debug unfound_objects_exist" TRUE 3 120
    [ $? -eq 1 ] || die "Failed to see unfound objects."
    echo "Got unfound objects."

    # Turn on recovery and wait for it to complete.
    restart_osd 0
    sleep 20
    start_recovery 2
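
    # Note: start_recovery <n> (from test_common.sh) presumably lifts the long
    # 'osd recovery delay start' set in setup() on each of the <n> OSDs, so
    # that the delayed recovery can actually begin.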
    poll_cmd "./ceph pg debug unfound_objects_exist" FALSE 3 120
    [ $? -eq 1 ] || die "Failed to recover unfound objects."
    poll_cmd "./ceph pg debug degraded_pgs_exist" FALSE 3 120
    [ $? -eq 1 ] || die "Recovery never finished."
}

recovery1() {
    setup 2 'osd recovery delay start = 10000'
    recovery1_impl
}

lost1_impl() {
    local flags="$*"
    local lost_action=delete
    local pgs_unfound pg rc

    if is_set revert_lost $flags; then
        lost_action=revert
    fi
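    # is_set (from test_common.sh) presumably checks whether its first argument
    # appears among the rest. For mark_unfound_lost, 'revert' rolls an unfound
    # object back to the newest version the surviving OSDs still hold, while
    # 'delete' makes the cluster forget the object entirely.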

    # Write lots and lots of objects
    write_objects 1 1 20 8000 $TEST_POOL

    # Take down osd1
    stop_osd 1

    # Continue writing a lot of objects
    write_objects 2 2 20 8000 $TEST_POOL

    # Bring up osd1
    restart_osd 1

    # Finish peering.
    sleep 15

    # Stop osd0.
    # At this point we have peered, but *NOT* recovered.
    # Objects should be lost.
    stop_osd 0

    # Since recovery can't proceed, stuff should be unfound.
    poll_cmd "./ceph pg debug unfound_objects_exist" TRUE 3 120
    [ $? -eq 1 ] || die "Failed to see unfound objects."

    pgs_unfound=$(./ceph health detail | awk '$1 == "pg" && /[0-9] unfound$/ {print $2}')

    [ -n "$pgs_unfound" ] || die "no pg with unfound objects"

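    # While osd0 is merely down (not yet marked lost), the cluster should
    # refuse to mark unfound objects lost, since osd0 may come back with
    # the data.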
    for pg in $pgs_unfound; do
        ./ceph pg $pg mark_unfound_lost revert &&
            die "mark_unfound_lost unexpectedly succeeded for pg $pg"
    done

    if ! is_set mark_osd_lost $flags && ! is_set rm_osd $flags; then
        return
    fi

    if is_set try_to_fetch_unfound $flags; then
        # Ask for an object while it's still unfound, and
        # verify we get woken to an error when it's declared lost.
        echo "trying to get one of the unfound objects"
        (
            ./rados -c ./ceph.conf -p $TEST_POOL get obj02 $TEMPDIR/obj02 &&
                die "expected radostool error"
        ) &
    fi

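    # Declare osd0 permanently gone, either by marking it lost or by removing
    # it from the osdmap; only after that may its PGs give up on the unfound
    # objects.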
    if is_set mark_osd_lost $flags; then
        ./ceph osd lost 0 --yes-i-really-mean-it
    fi

    if is_set rm_osd $flags; then
        ./ceph osd rm 0
    fi

    if ! is_set auto_mark_unfound_lost $flags; then
        for pg in $pgs_unfound; do
            ./ceph pg $pg mark_unfound_lost ${lost_action} ||
                die "mark_unfound_lost failed for pg $pg"
        done
    fi

    start_recovery 2

    # Unfound objects go away and are turned into lost objects.
    poll_cmd "./ceph pg debug unfound_objects_exist" FALSE 3 120
    [ $? -eq 1 ] || die "Unfound objects didn't go away."

    for pg in $(./ceph pg ls | awk '/^[0-9]/ {print $1}'); do
        ./ceph pg $pg mark_unfound_lost revert 2>&1 |
            grep 'pg has no unfound objects' ||
            die "pg $pg has unfound objects"
    done

    # Reading from a lost object gives back an error code.
    # TODO: check error code
    ./rados -c ./ceph.conf -p $TEST_POOL get obj01 $TEMPDIR/obj01
    rc=$?
    if [ "$lost_action" = delete ] && [ $rc -eq 0 ]; then
        die "expected radostool error"
    elif [ "$lost_action" = revert ] && [ $rc -ne 0 ]; then
        die "unexpected radostool error"
    fi

    if is_set try_to_fetch_unfound $flags; then
        echo "waiting for the try_to_fetch_unfound radostool instance to finish"
        wait
    fi
}

lost1() {
    setup 2 'osd recovery delay start = 10000'
    lost1_impl mark_osd_lost revert_lost
}

lost2() {
    setup 2 'osd recovery delay start = 10000'
    lost1_impl mark_osd_lost try_to_fetch_unfound
}

lost3() {
    setup 2 'osd recovery delay start = 10000'
    lost1_impl rm_osd
}

lost4() {
    setup 2 'osd recovery delay start = 10000'
    lost1_impl mark_osd_lost rm_osd
}

lost5() {
    setup 2 'osd recovery delay start = 10000'
    lost1_impl mark_osd_lost auto_mark_unfound_lost
}

all_osds_die_impl() {
    poll_cmd "./ceph osd stat" '3 up, 3 in' 20 240
    [ $? -eq 1 ] || die "didn't start 3 osds"

    stop_osd 0
    stop_osd 1
    stop_osd 2

    # wait for the MOSDPGStat timeout
    poll_cmd "./ceph osd stat" '0 up' 20 240
    [ $? -eq 1 ] || die "all osds weren't marked as down"
}

all_osds_die() {
    setup 3 'osd mon report interval = 3
             mon osd report timeout = 60'
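
    # 'osd mon report interval = 3' makes the OSDs report to the monitor every
    # 3 seconds; 'mon osd report timeout = 60' lets the monitor mark OSDs that
    # stay silent for 60 seconds as down, well inside the 240s poll window
    # used in all_osds_die_impl.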

    all_osds_die_impl
}

run() {
    recovery1 || die "test failed"

    lost1 || die "test failed"

    # XXX: try_to_fetch_unfound test currently hangs on "waiting for the
    # try_to_fetch_unfound radostool instance to finish"
    #lost2 || die "test failed"

    lost3 || die "test failed"

    lost4 || die "test failed"

    # XXX: automatically marking lost is not implemented
    #lost5 || die "test failed"

    all_osds_die || die "test failed"
}

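# With no arguments, run the whole suite; with arguments, run only the named
# test function(s), e.g. ./test_lost.sh lost3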
if [ $# -eq 0 ]; then
    run
    echo OK
    exit 0
fi

"$@"