]> git.proxmox.com Git - ceph.git/blob - ceph/qa/standalone/osd/osd-rep-recov-eio.sh
bump version to 15.2.4-pve1
[ceph.git] / ceph / qa / standalone / osd / osd-rep-recov-eio.sh
1 #!/usr/bin/env bash
2 #
3 # Copyright (C) 2017 Red Hat <contact@redhat.com>
4 #
5 #
6 # Author: Kefu Chai <kchai@redhat.com>
7 # Author: David Zafman <dzafman@redhat.com>
8 #
9 # This program is free software; you can redistribute it and/or modify
10 # it under the terms of the GNU Library Public License as published by
11 # the Free Software Foundation; either version 2, or (at your option)
12 # any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU Library Public License for more details.
18 #
19
20 source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
21
#
# Entry point invoked by the test harness.
#
# Arguments: $1     - working directory for the test cluster
#            $2...  - optional names of the TEST_* functions to run; when
#                     none are given, every TEST_* function currently
#                     defined in the shell is run
# Globals:   exports CEPH_MON and CEPH_ARGS (fsid, auth and mon-host
#            options are appended to CEPH_ARGS)
# Returns:   1 as soon as any setup step, test or teardown fails
#
function run() {
    local dir=$1
    shift

    # git grep '\<7140\>' : there must be only one
    export CEPH_MON="127.0.0.1:7140"
    export CEPH_ARGS
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none --mon-host=$CEPH_MON "

    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
    for func in $funcs ; do
        # Each test gets its own mon + mgr and a pool named foo; the
        # first failing step aborts the whole run.
        setup $dir &&
            run_mon $dir a &&
            run_mgr $dir x &&
            ceph osd pool create foo 8 &&
            $func $dir &&
            teardown $dir ||
            return 1
    done
}
43
#
# Start OSDs 0 .. count-1 and wait for the cluster to become clean.
#
# Arguments: $1 - number of OSDs to start
#            $2 - optional suffix selecting the launcher, i.e.
#                 run_osd${2} is called (empty means plain run_osd)
# Globals:   reads $dir, which is NOT a parameter here; it is picked up
#            from the calling function's scope
# Returns:   1 if an OSD fails to start or the wait fails
#
function setup_osds() {
    local count=$1
    local type=$2
    local id

    for ((id = 0; id < count; id++)); do
        run_osd${type} $dir $id || return 1
    done
    wait_for_clean || return 1
}
54
#
# Print the "state" field of the pg_stats entry whose pgid equals $1,
# as reported by "ceph pg dump pgs" in JSON form.
#
# Arguments: $1 - PG id (e.g. 2.0)
# Outputs:   the state string on stdout (nothing if the PG is not listed)
#
function get_state() {
    local pgid=$1
    local filter=".pg_stats | .[] | select(.pgid==\"$pgid\") | .state"

    ceph --format json pg dump pgs 2>/dev/null | jq -r "$filter"
}
61
#
# Generate a reference payload in $dir/ORIGINAL and store it as an object.
#
# Arguments: $1 - directory that receives the ORIGINAL file
#            $2 - pool name
#            $3 - object name (default: SOMETHING)
# Returns:   1 if the rados put fails
#
function rados_put() {
    local dir=$1
    local poolname=$2
    local objname=${3:-SOMETHING}
    local payload=$dir/ORIGINAL

    # Four markers, each right-aligned in a 1024 character field; printf
    # reuses the format for every (width, string) pair.
    printf "%*s" 1024 AAA 1024 BBB 1024 CCCC 1024 DDDD > $payload

    #
    # put the generated file into the pool
    #
    rados --pool $poolname put $objname $payload || return 1
}
75
#
# Read an object into $dir/COPY and check the outcome.
#
# Arguments: $1 - directory holding ORIGINAL and receiving COPY
#            $2 - pool name
#            $3 - object name (default: SOMETHING)
#            $4 - expected outcome (default: ok)
#                   fail - the get must return an error
#                   hang - the get must still be running when a 5 second
#                          timeout fires (timeout exit status 124)
#                   anything else - the get must succeed and COPY must
#                          be identical to $dir/ORIGINAL; COPY is then
#                          removed
# Returns:   0 when the observed outcome matches the expectation
#
function rados_get() {
    local dir=$1
    local poolname=$2
    local objname=${3:-SOMETHING}
    local expect=${4:-ok}
    local copy=$dir/COPY

    case $expect in
        fail)
            #
            # Expect a failure to get object
            #
            ! rados --pool $poolname get $objname $copy
            return
            ;;
        hang)
            #
            # Expect hang trying to get object
            #
            timeout 5 rados --pool $poolname get $objname $copy
            test "$?" = "124"
            return
            ;;
    esac

    #
    # get an object, compare with $dir/ORIGINAL
    #
    rados --pool $poolname get $objname $copy || return 1
    diff $dir/ORIGINAL $copy || return 1
    rm $copy
}
106
#
# Exercise read-triggered repair of an object in pool "pool-rep" after
# injecting errors on one, two and finally three shard indexes.
#
# Arguments: $1 - injection kind; inject_$1 is the helper that is called
#            $2 - test directory
# Globals:   writes COUNT (last repair counter that was read)
# Returns:   1 on the first failing step or unexpected counter value
#
# NOTE(review): the shard indexes passed to inject_* look like positions
# in the list printed by get_osds (0 being the primary) -- confirm
# against the helper, which is defined outside this file.
#
function rados_get_data() {
    local inject=$1
    shift
    local dir=$1

    local poolname=pool-rep
    local objname=obj-$inject-$$
    local pgid=$(get_pg $poolname $objname)
    local shard

    # Round 1: error on shard 0 only; a normal read must still succeed.
    rados_put $dir $poolname $objname || return 1
    inject_$inject rep data $poolname $objname $dir 0 || return 1
    rados_get $dir $poolname $objname || return 1

    COUNT=$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_objects_repaired')
    [ "$COUNT" = "1" ] || return 1
    flush_pg_stats
    COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
    [ "$COUNT" = "1" ] || return 1

    # Round 2: errors on shards 0 and 1.
    local acting=($(get_osds $poolname $objname))
    local primary=${acting[0]}
    local bad_peer=${acting[1]}
    for shard in 0 1; do
        inject_$inject rep data $poolname $objname $dir $shard || return 1
    done
    # Force primary to pull from the bad peer, so we can repair it too!
    set_config osd $primary osd_debug_feed_pullee $bad_peer || return 1
    rados_get $dir $poolname $objname || return 1

    # Wait until automatic repair of bad peer is done
    wait_for_clean || return 1

    # Round 3: errors on shards 0 and 2.
    for shard in 0 2; do
        inject_$inject rep data $poolname $objname $dir $shard || return 1
    done
    rados_get $dir $poolname $objname || return 1

    COUNT=$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_objects_repaired')
    [ "$COUNT" = "3" ] || return 1
    flush_pg_stats
    COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
    [ "$COUNT" = "4" ] || return 1

    # Round 4: errors on shards 0, 1 and 2; the read is expected to hang.
    for shard in 0 1 2; do
        inject_$inject rep data $poolname $objname $dir $shard || return 1
    done
    rados_get $dir $poolname $objname hang || return 1

    # After hang another repair couldn't happen, so count stays the same
    COUNT=$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_objects_repaired')
    [ "$COUNT" = "3" ] || return 1
    flush_pg_stats
    COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
    [ "$COUNT" = "4" ] || return 1
}
160
#
# Run the rados_get_data scenario with "eio" injection on a 4 OSD
# cluster, using a freshly created pool named pool-rep.
#
# Arguments: $1 - test directory
# Returns:   1 if any step fails, otherwise the status of delete_pool
#
function TEST_rados_get_with_eio() {
    local dir=$1

    if ! setup_osds 4; then
        return 1
    fi

    local poolname=pool-rep
    create_pool $poolname 1 1 &&
        wait_for_clean &&
        rados_get_data eio $dir ||
        return 1

    delete_pool $poolname
}
173
# Test backfill with unfound object
#
# Arguments: $1 - test directory
# Globals:   CEPH_ARGS is extended with small pg log limits for the
#            duration of this function only
# Returns:   1 on the first failing step, otherwise the status of
#            delete_pool
#
# Flow: write one object, stop the last OSD of its set and mark it
# down/out, write $lastobj more objects, inject errors for $testobj on
# shard indexes 0 and 1, bring the OSD back, wait for the PG to report
# backfill_unfound, check that $testobj is listed as unfound and that
# reading it hangs, mark it lost (delete) and verify every other object.
function TEST_rep_backfill_unfound() {
    local dir=$1
    local objname=myobject
    local lastobj=300
    # Must be between 1 and $lastobj
    local testobj=obj250
    local i state tmp

    # Shadow CEPH_ARGS with an exported local so the extra options are
    # visible to everything started from this test but are dropped again
    # when the function returns (on every return path). Appending to the
    # global would leak the limits into all tests that run afterwards.
    # NOTE(review): the small log limits are presumably what forces the
    # returning OSD to backfill instead of recovering from the log.
    local -x CEPH_ARGS="${CEPH_ARGS} --osd_min_pg_log_entries=5 --osd_max_pg_log_entries=10"
    setup_osds 3 || return 1

    local poolname=test-pool
    create_pool "$poolname" 1 1 || return 1
    wait_for_clean || return 1

    ceph pg dump pgs

    rados_put "$dir" "$poolname" "$objname" || return 1

    # Word splitting of the get_osds output into an array is intended.
    local -a initial_osds=($(get_osds "$poolname" "$objname"))
    local last_osd=${initial_osds[-1]}
    kill_daemons "$dir" TERM "osd.${last_osd}" < /dev/null || return 1
    ceph osd down "${last_osd}" || return 1
    ceph osd out "${last_osd}" || return 1

    ceph pg dump pgs

    dd if=/dev/urandom of="${dir}/ORIGINAL" bs=1024 count=4
    for i in $(seq 1 "$lastobj"); do
        rados --pool "$poolname" put "obj${i}" "$dir/ORIGINAL" || return 1
    done

    inject_eio rep data "$poolname" "$testobj" "$dir" 0 || return 1
    inject_eio rep data "$poolname" "$testobj" "$dir" 1 || return 1

    activate_osd "$dir" "${last_osd}" || return 1
    ceph osd in "${last_osd}" || return 1

    sleep 15

    # Poll up to 100 times, one second apart, for backfill_unfound. If
    # the state never shows up the list_unfound check below fails.
    for tmp in $(seq 1 100); do
        state=$(get_state 2.0)
        if echo $state | grep backfill_unfound; then
            break
        fi
        echo "$state "
        sleep 1
    done

    ceph pg dump pgs
    ceph pg 2.0 list_unfound | grep -q "$testobj" || return 1

    # Command should hang because object is unfound
    timeout 5 rados -p "$poolname" get "$testobj" "$dir/CHECK"
    test $? = "124" || return 1

    ceph pg 2.0 mark_unfound_lost delete

    wait_for_clean || return 1

    for i in $(seq 1 "$lastobj"); do
        if [ "obj${i}" = "$testobj" ]; then
            # Doesn't exist anymore
            ! rados -p "$poolname" get "$testobj" "$dir/CHECK" || return 1
        else
            rados --pool "$poolname" get "obj${i}" "$dir/CHECK" || return 1
            diff -q "$dir/ORIGINAL" "$dir/CHECK" || return 1
        fi
    done

    rm -f "${dir}/ORIGINAL" "${dir}/CHECK"

    delete_pool "$poolname"
}
252
# Test recovery with unfound object
#
# Arguments: $1 - test directory
# Returns:   1 on the first failing step, otherwise the status of
#            delete_pool
#
# Flow: write one object, stop the last OSD of its set and mark it
# down/out, write $lastobj more objects, inject errors for $testobj on
# shard indexes 0 and 1, bring the OSD back, wait until the PG state no
# longer contains "recovering", check that $testobj is listed as unfound
# and that reading it hangs, mark it lost (delete) and verify every
# other object.
function TEST_rep_recovery_unfound() {
    local dir=$1
    local objname=myobject
    local lastobj=100
    # Must be between 1 and $lastobj
    local testobj=obj75
    local i tmp state

    setup_osds 3 || return 1

    local poolname=test-pool
    create_pool $poolname 1 1 || return 1
    wait_for_clean || return 1

    ceph pg dump pgs

    rados_put $dir $poolname $objname || return 1

    # Take the last OSD of the object's set out of service.
    local -a initial_osds=($(get_osds $poolname $objname))
    local last_osd=${initial_osds[-1]}
    kill_daemons $dir TERM osd.${last_osd} < /dev/null || return 1
    ceph osd down ${last_osd} || return 1
    ceph osd out ${last_osd} || return 1

    ceph pg dump pgs

    # Objects written while that OSD is away.
    dd if=/dev/urandom of=${dir}/ORIGINAL bs=1024 count=4
    for ((i = 1; i <= lastobj; i++)); do
        rados --pool $poolname put obj${i} $dir/ORIGINAL || return 1
    done

    for i in 0 1; do
        inject_eio rep data $poolname $testobj $dir $i || return 1
    done

    activate_osd $dir ${last_osd} || return 1
    ceph osd in ${last_osd} || return 1

    sleep 15

    # Poll up to 100 times, one second apart, until the reported state
    # has no "recovering" in it (grep -v also echoes that state).
    for ((tmp = 1; tmp <= 100; tmp++)); do
        state=$(get_state 2.0)
        if echo $state | grep -v recovering; then
            break
        fi
        echo "$state "
        sleep 1
    done

    ceph pg dump pgs
    ceph pg 2.0 list_unfound | grep -q $testobj || return 1

    # Command should hang because object is unfound
    timeout 5 rados -p $poolname get $testobj $dir/CHECK
    test $? = "124" || return 1

    ceph pg 2.0 mark_unfound_lost delete

    wait_for_clean || return 1

    for ((i = 1; i <= lastobj; i++)); do
        if [ obj${i} != "$testobj" ]; then
            rados --pool $poolname get obj${i} $dir/CHECK || return 1
            diff -q $dir/ORIGINAL $dir/CHECK || return 1
            continue
        fi
        # Doesn't exist anymore
        ! rados -p $poolname get $testobj $dir/CHECK || return 1
    done

    rm -f ${dir}/ORIGINAL ${dir}/CHECK

    delete_pool $poolname
}
329
# This is a filestore only test because it requires data digest in object info
#
# Arguments: $1 - test directory
# Returns:   1 on the first failing step or failed check, 0 otherwise
#
# Flow: store an object in a size 2 pool, overwrite its bytes with other
# data on both the primary and the non-primary OSD, start a read in the
# background, check that the PG reports recovery_unfound, restore the
# original bytes on the non-primary OSD and verify that the pending read
# then delivers the original data.
function TEST_rep_read_unfound() {
    local dir=$1
    local objname=myobject

    setup_osds 3 _filestore || return 1

    ceph osd pool delete foo foo --yes-i-really-really-mean-it || return 1
    local poolname=test-pool
    create_pool "$poolname" 1 1 || return 1
    # The remaining steps rely on this pool setting, so a failure here
    # must stop the test instead of surfacing later as a confusing error.
    ceph osd pool set "$poolname" size 2 || return 1
    wait_for_clean || return 1

    ceph pg dump pgs

    dd if=/dev/urandom bs=8k count=1 of="$dir/ORIGINAL"
    # Without a stored object there is nothing to corrupt or read back.
    rados -p "$poolname" put "$objname" "$dir/ORIGINAL" || return 1

    local primary=$(get_primary "$poolname" "$objname")
    local other=$(get_not_primary "$poolname" "$objname")

    dd if=/dev/urandom bs=8k count=1 of="$dir/CORRUPT"
    objectstore_tool "$dir" "$primary" "$objname" set-bytes "$dir/CORRUPT" || return 1
    objectstore_tool "$dir" "$other" "$objname" set-bytes "$dir/CORRUPT" || return 1

    # The read runs in the background; it is collected by "wait" below,
    # after good data has been put back.
    timeout 30 rados -p "$poolname" get "$objname" "$dir/tmp" &

    sleep 5

    flush_pg_stats
    ceph --format=json pg dump pgs | jq '.'

    if ! ceph --format=json pg dump pgs | jq '.pg_stats | .[0].state' | grep -q recovery_unfound
    then
        echo "Failure to get to recovery_unfound state"
        return 1
    fi

    objectstore_tool "$dir" "$other" "$objname" set-bytes "$dir/ORIGINAL" || return 1

    wait

    if ! cmp "$dir/ORIGINAL" "$dir/tmp"
    then
        echo "Bad data after primary repair"
        return 1
    fi
}
378
379 main osd-rep-recov-eio.sh "$@"
380
381 # Local Variables:
382 # compile-command: "cd ../../../build ; make -j4 && ../qa/run-standalone.sh osd-rep-recov-eio.sh"
383 # End: