#!/bin/bash -x

#
# Test the lost object logic
#

# Includes
source "`dirname $0`/test_common.sh"
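# test_common.sh is assumed to provide the helpers used below: die,
# poll_cmd, write_objects, stop_osd, restart_osd, start_recovery, is_set,
# and the $TEMPDIR scratch directory.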

TEST_POOL=rbd

# Functions
setup() {
  export CEPH_NUM_OSD=$1
  vstart_config=$2

  # Start ceph
  ./stop.sh

  # Set the recovery start delay to a really long time to ensure that we
  # don't start recovery prematurely.
  ./vstart.sh -d -n -o "$vstart_config" || die "vstart failed"

  # For existing pools, set size no greater than the number of OSDs,
  # so that recovery from degraded pgs is possible.
  local changed=0
  for pool in `./ceph osd pool ls`; do
    local size=`./ceph osd pool get ${pool} size | awk '{print $2}'`
    if [ "${size}" -gt "${CEPH_NUM_OSD}" ]; then
      ./ceph osd pool set ${pool} size ${CEPH_NUM_OSD}
      changed=1
    fi
  done
  if [ ${changed} -eq 1 ]; then
    # XXX: When a pool has degraded pgs because its size exceeded the
    # number of OSDs, recovery can still get stuck after the size is
    # decreased, and needs an additional kick.
    ./ceph osd out 0
    ./ceph osd in 0
  fi

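  # poll_cmd (from test_common.sh) presumably re-runs the given command
  # until its output contains the expected string; the two trailing numbers
  # look like a polling interval (seconds) and an overall limit. Note that
  # callers below treat a return value of 1, not 0, as success.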
  poll_cmd "./ceph health" HEALTH_OK 1 30
}

recovery1_impl() {
  # Write lots and lots of objects
  write_objects 1 1 200 4000 $TEST_POOL
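  # write_objects is assumed to take (start_phase, end_phase, num_objects,
  # object_size, pool); here: 200 objects of 4000 bytes each in $TEST_POOL.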

  # Take down osd1
  stop_osd 1

  # Continue writing a lot of objects
  write_objects 2 2 200 4000 $TEST_POOL

  # Bring up osd1
  restart_osd 1

  # Finish peering.
  sleep 15

  # Stop osd0.
  # At this point we have peered, but *NOT* recovered.
  # Objects should be lost.
  stop_osd 0

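  # The "ceph pg debug ..._exist" commands print TRUE or FALSE, so we can
  # poll until the cluster reports degraded PGs and unfound objects.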
  poll_cmd "./ceph pg debug degraded_pgs_exist" TRUE 3 120
  [ $? -eq 1 ] || die "Failed to see degraded PGs."
  poll_cmd "./ceph pg debug unfound_objects_exist" TRUE 3 120
  [ $? -eq 1 ] || die "Failed to see unfound objects."
  echo "Got unfound objects."

  restart_osd 0
  sleep 20
  start_recovery 2
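  # start_recovery (from test_common.sh) presumably kicks the given number
  # of OSDs into recovery, e.g. by lowering the huge
  # "osd recovery delay start" that setup passed to vstart.sh.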

  # Recovery has been kicked off; wait for it to complete.
  poll_cmd "./ceph pg debug unfound_objects_exist" FALSE 3 120
  [ $? -eq 1 ] || die "Failed to recover unfound objects."
  poll_cmd "./ceph pg debug degraded_pgs_exist" FALSE 3 120
  [ $? -eq 1 ] || die "Recovery never finished."
}

recovery1() {
  setup 2 'osd recovery delay start = 10000'
  recovery1_impl
}
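
# The 10000-second "osd recovery delay start" keeps the OSDs from beginning
# recovery on their own, so each test decides when recovery starts by
# calling start_recovery explicitly.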

lost1_impl() {
  local flags="$@"
  local lost_action=delete
  local pgs_unfound pg rc

  if is_set revert_lost $flags; then
    lost_action=revert
  fi

  # Write lots and lots of objects
  write_objects 1 1 20 8000 $TEST_POOL

  # Take down osd1
  stop_osd 1

  # Continue writing a lot of objects
  write_objects 2 2 20 8000 $TEST_POOL

  # Bring up osd1
  restart_osd 1

  # Finish peering.
  sleep 15

  # Stop osd0.
  # At this point we have peered, but *NOT* recovered.
  # Objects should be lost.
  stop_osd 0

  # Since recovery can't proceed, stuff should be unfound.
  poll_cmd "./ceph pg debug unfound_objects_exist" TRUE 3 120
  [ $? -eq 1 ] || die "Failed to see unfound objects."

  pgs_unfound=`./ceph health detail | awk '$1 == "pg" && /[0-9] unfound$/ {print $2}'`
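  # The awk above is meant to pick the pg ids out of "ceph health detail"
  # lines that end in an unfound count, roughly (format assumed):
  #   pg 1.7 is active+degraded, acting [1], 42 unfound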

  [ -n "$pgs_unfound" ] || die "no pg with unfound objects"

  for pg in $pgs_unfound; do
    ./ceph pg $pg mark_unfound_lost revert &&
      die "mark_unfound_lost unexpectedly succeeded for pg $pg"
  done
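  # This should fail: osd0 is only down, not lost, so the cluster still
  # hopes to retrieve the unfound objects from it when it comes back.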

  if ! is_set mark_osd_lost $flags && ! is_set rm_osd $flags; then
    return
  fi

  if is_set try_to_fetch_unfound $flags; then
    # Ask for an object while it's still unfound, and
    # verify we get woken to an error when it's declared lost.
    echo "trying to get one of the unfound objects"
    (
      ./rados -c ./ceph.conf -p $TEST_POOL get obj02 $TEMPDIR/obj02 &&
        die "expected radostool error"
    ) &
  fi

  if is_set mark_osd_lost $flags; then
    ./ceph osd lost 0 --yes-i-really-mean-it
  fi
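  # "ceph osd lost" declares osd0's data permanently gone, letting peering
  # stop waiting for the unfound objects to reappear there.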

  if is_set rm_osd $flags; then
    ./ceph osd rm 0
  fi

  if ! is_set auto_mark_unfound_lost $flags; then
    for pg in $pgs_unfound; do
      ./ceph pg $pg mark_unfound_lost ${lost_action} ||
        die "mark_unfound_lost failed for pg $pg"
    done
  fi
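  # mark_unfound_lost's action decides the objects' fate: "delete" forgets
  # them entirely, while "revert" rolls each one back to the newest version
  # the surviving OSDs still have.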

  start_recovery 2

  # Unfound objects go away and are turned into lost objects.
  poll_cmd "./ceph pg debug unfound_objects_exist" FALSE 3 120
  [ $? -eq 1 ] || die "Unfound objects didn't go away."

  for pg in `./ceph pg ls | awk '/^[0-9]/ {print $1}'`; do
    ./ceph pg $pg mark_unfound_lost revert 2>&1 |
      grep 'pg has no unfound objects' ||
      die "pg $pg has unfound objects"
  done

  # Reading from a lost object gives back an error code.
  # TODO: check error code
  ./rados -c ./ceph.conf -p $TEST_POOL get obj01 $TEMPDIR/obj01
  rc=$?
  if [ "$lost_action" = delete -a $rc -eq 0 ]; then
    die "expected radostool error"
  elif [ "$lost_action" = revert -a $rc -ne 0 ]; then
    die "unexpected radostool error"
  fi

  if is_set try_to_fetch_unfound $flags; then
    echo "waiting for the try_to_fetch_unfound radostool instance to finish"
    wait
  fi
}

lost1() {
  setup 2 'osd recovery delay start = 10000'
  lost1_impl mark_osd_lost revert_lost
}

lost2() {
  setup 2 'osd recovery delay start = 10000'
  lost1_impl mark_osd_lost try_to_fetch_unfound
}

lost3() {
  setup 2 'osd recovery delay start = 10000'
  lost1_impl rm_osd
}

lost4() {
  setup 2 'osd recovery delay start = 10000'
  lost1_impl mark_osd_lost rm_osd
}

lost5() {
  setup 2 'osd recovery delay start = 10000'
  lost1_impl mark_osd_lost auto_mark_unfound_lost
}

all_osds_die_impl() {
  poll_cmd "./ceph osd stat" '3 up, 3 in' 20 240
  [ $? -eq 1 ] || die "didn't start 3 osds"

  stop_osd 0
  stop_osd 1
  stop_osd 2

  # wait for the MOSDPGStat timeout
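  # (i.e. the monitors should mark every OSD down once they have gone
  # "mon osd report timeout" seconds without an MOSDPGStat report)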
  poll_cmd "./ceph osd stat" '0 up' 20 240
  [ $? -eq 1 ] || die "all osds weren't marked as down"
}

all_osds_die() {
  setup 3 'osd mon report interval max = 60
  osd mon report interval min = 3
  mon osd report timeout = 60'

  all_osds_die_impl
}

run() {
  recovery1 || die "test failed"

  lost1 || die "test failed"

  # XXX: the try_to_fetch_unfound test currently hangs on "waiting for the
  # try_to_fetch_unfound radostool instance to finish"
  #lost2 || die "test failed"

  lost3 || die "test failed"

  lost4 || die "test failed"

  # XXX: automatically marking lost is not implemented
  #lost5 || die "test failed"

  all_osds_die || die "test failed"
}

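# Run the whole suite when invoked with no arguments; otherwise run the
# single named test, e.g.: ./test_lost.sh lost3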
if [ $# -eq 0 ]; then
  run
  echo OK
  exit 0
fi

"$@"