#! /usr/bin/env bash
#
# Copyright (C) 2017 Red Hat <contact@redhat.com>
#
# Author: David Zafman <dzafman@redhat.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Library Public License for more details.
#

# Shared helpers for the standalone test framework (setup, run_mon,
# run_osd, wait_for_clean, main, ...).
source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
# Entry point invoked by main(): configures the test cluster environment
# and dispatches to every TEST_* function (or just those named on the
# command line).
#
# $1 - scratch directory for cluster state; remaining args optionally
#      name specific TEST_* functions to run.
# Returns non-zero as soon as any test function fails.
function run() {
    local dir=$1
    shift

    export CEPH_MON="127.0.0.1:7124" # git grep '\<7124\>' : there must be only one
    export CEPH_ARGS
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
    CEPH_ARGS+="--mon-host=$CEPH_MON "

    # Make sure duplicate-command checking from the CLI test harness is
    # not inherited by the daemons we spawn.
    export -n CEPH_CLI_TEST_DUP_COMMAND

    # Default to every function whose name matches TEST_*; `set` lists
    # shell functions as "NAME ()" lines, which the sed extracts.
    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
    local func
    for func in $funcs ; do
        $func $dir || return 1
    done
}
# Simple test for "not scheduling scrubs due to active recovery"
# OSD::sched_scrub() called on all OSDs during ticks
#
# With osd_scrub_during_recovery=false, every OSD is expected to log the
# "not scheduling scrubs due to active recovery" message while recovery
# is in progress.
function TEST_recovery_scrub_1() {
    local dir=$1
    local poolname=test

    TESTDATA="testdata.$$"
    OSDS=4
    PGS=1
    OBJECTS=100
    ERRORS=0

    setup $dir || return 1
    run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true \
            --osd_scrub_interval_randomize_ratio=0.0 || return 1
    run_mgr $dir x || return 1
    local osd
    for osd in $(seq 0 $((OSDS - 1)))
    do
        # Scrubbing must be suppressed while recovery is active.
        run_osd $dir $osd --osd_scrub_during_recovery=false || return 1
    done

    # Create a pool with $PGS pgs
    create_pool $poolname $PGS $PGS
    wait_for_clean || return 1
    poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')

    ceph pg dump pgs

    # Write $OBJECTS copies of 50MB of random data.
    dd if=/dev/urandom of=$TESTDATA bs=1M count=50
    local i
    for i in $(seq 1 $OBJECTS)
    do
        rados -p $poolname put obj${i} $TESTDATA
    done
    rm -f $TESTDATA

    # Growing the pool from size 1 to 4 forces recovery of every object.
    ceph osd pool set $poolname size 4

    # Wait for recovery to start
    set -o pipefail
    count=0
    while true
    do
        if ceph --format json pg dump pgs |
           jq '.pg_stats | [.[] | .state | contains("recovering")]' | grep -q true
        then
            break
        fi
        sleep 2
        if [ "$count" -eq 10 ]
        then
            echo "Recovery never started"
            return 1
        fi
        count=$((count + 1))
    done
    set +o pipefail
    ceph pg dump pgs

    # Give the OSD tick a chance to attempt (and refuse) scrub scheduling.
    sleep 10
    # Work around for http://tracker.ceph.com/issues/38195
    kill_daemons $dir #|| return 1

    declare -a err_strings
    err_strings[0]="not scheduling scrubs due to active recovery"

    # Echo the matches for debugging before the real checks below.
    for osd in $(seq 0 $((OSDS - 1)))
    do
        grep "not scheduling scrubs" $dir/osd.${osd}.log
    done
    local err_string
    for err_string in "${err_strings[@]}"
    do
        found=false
        count=0
        for osd in $(seq 0 $((OSDS - 1)))
        do
            if grep -q "$err_string" $dir/osd.${osd}.log
            then
                found=true
                count=$((count + 1))
            fi
        done
        if [ "$found" = "false" ]; then
            echo "Missing log message '$err_string'"
            ERRORS=$((ERRORS + 1))
        fi
        # Every single OSD must have logged the message.
        [ $count -eq $OSDS ] || return 1
    done

    teardown $dir || return 1

    if [ $ERRORS != "0" ];
    then
        echo "TEST FAILED WITH $ERRORS ERRORS"
        return 1
    fi

    echo "TEST PASSED"
    return 0
}
##
# a modified version of wait_for_scrub(), which terminates if the Primary
# of the to-be-scrubbed PG changes
#
# Given the *last_scrub*, wait for scrub to happen on **pgid**. It
# will fail if scrub does not complete within $TIMEOUT seconds. The
# repair is complete whenever the **get_last_scrub_stamp** function
# reports a timestamp different from the one given in argument.
#
# @param pgid the id of the PG
# @param the primary OSD when started
# @param last_scrub timestamp of the last scrub for *pgid*
# @param (optional) the stamp to watch, defaults to last_scrub_stamp
# @return 0 on success (scrub done or primary changed), 1 on timeout
#
function wait_for_scrub_mod() {
    local pgid=$1
    local orig_primary=$2
    local last_scrub="$3"
    local sname=${4:-last_scrub_stamp}

    for ((i=0; i < $TIMEOUT; i++)); do
        sleep 0.2
        # String '>' comparison: stamps are ISO timestamps, so a newer
        # stamp compares greater lexically.
        if test "$(get_last_scrub_stamp $pgid $sname)" '>' "$last_scrub" ; then
            return 0
        fi
        sleep 1
        # are we still the primary?
        # NOTE: use 'ceph' from PATH like every other invocation in this
        # file; the previous 'bin/ceph' only worked when CWD was the
        # build directory.
        local current_primary=$(ceph pg $pgid query | jq '.acting[0]')
        if [ $orig_primary != $current_primary ]; then
            echo $orig_primary no longer primary for $pgid
            return 0
        fi
    done
    return 1
}
##
# A modified version of pg_scrub()
#
# Run scrub on **pgid** and wait until it completes. The pg_scrub
# function will fail if repair does not complete within $TIMEOUT
# seconds. The pg_scrub is complete whenever the
# **get_last_scrub_stamp** function reports a timestamp different from
# the one stored before starting the scrub, or whenever the Primary
# changes.
#
# @param pgid the id of the PG
# @return 0 on success, 1 on error, 2 if the PG was in a "recovering"
#         state when the scrub was requested (wait_background_check
#         counts these via recov_scrub_count)
#
function pg_scrub_mod() {
    local pgid=$1
    local last_scrub=$(get_last_scrub_stamp $pgid)
    # locate the primary
    # NOTE: use 'ceph' from PATH like every other invocation in this
    # file; the previous 'bin/ceph' only worked when CWD was the build
    # directory.
    local my_primary=$(ceph pg $pgid query | jq '.acting[0]')
    local recovery=false
    ceph pg scrub $pgid
    #ceph --format json pg dump pgs | jq ".pg_stats | .[] | select(.pgid == \"$pgid\") | .state"
    # Remember whether this PG was recovering when the scrub was issued.
    if ceph --format json pg dump pgs | jq ".pg_stats | .[] | select(.pgid == \"$pgid\") | .state" | grep -q recovering
    then
        recovery=true
    fi
    wait_for_scrub_mod $pgid $my_primary "$last_scrub" || return 1
    if test $recovery = "true"
    then
        return 2
    fi
}
# Same as wait_background() except that it checks for exit code 2 and bumps recov_scrub_count
#
# $1 - the *name* of a variable holding a whitespace-separated PID list
#      (indirect expansion); the variable is emptied on return.
# Side effect: increments the global recov_scrub_count once per child
# that exited with status 2.
# Returns 1 if any child exited with a status other than 0 or 2, else 0.
function wait_background_check() {
    # We extract the PIDS from the variable name
    pids=${!1}

    return_code=0
    local pid
    for pid in $pids; do
        wait $pid
        case $? in
            0)
                ;;
            2)
                # Exit code 2 flags "scrub ran while PG was recovering".
                recov_scrub_count=$((recov_scrub_count + 1))
                ;;
            *)
                # If one process failed then return 1
                return_code=1
                ;;
        esac
    done

    # We empty the variable reporting that all process ended
    eval "$1=''"

    return $return_code
}
# osd_scrub_during_recovery=true make sure scrub happens
#
# Scrubs all PGs while at least two are recovering (recovery is slowed
# with osd_recovery_sleep) and verifies that at least one scrub actually
# ran during recovery, and that the "not scheduling scrubs" message was
# NOT logged by any OSD.
function TEST_recovery_scrub_2() {
    local dir=$1
    local poolname=test

    TESTDATA="testdata.$$"
    OSDS=8
    PGS=32
    OBJECTS=40

    setup $dir || return 1
    run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true \
            --osd_scrub_interval_randomize_ratio=0.0 || return 1
    run_mgr $dir x || return 1
    local osd
    for osd in $(seq 0 $((OSDS - 1)))
    do
        # Slow recovery down so scrubs have a window to overlap with it.
        run_osd $dir $osd --osd_scrub_during_recovery=true --osd_recovery_sleep=10 || return 1
    done

    # Create a pool with $PGS pgs
    create_pool $poolname $PGS $PGS
    wait_for_clean || return 1
    poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')

    dd if=/dev/urandom of=$TESTDATA bs=1M count=50
    local i
    for i in $(seq 1 $OBJECTS)
    do
        rados -p $poolname put obj${i} $TESTDATA
    done
    rm -f $TESTDATA

    # Growing the pool size triggers recovery on every PG.
    ceph osd pool set $poolname size 3

    ceph pg dump pgs

    # Wait for recovery to start
    count=0
    while true
    do
        #ceph --format json pg dump pgs | jq '.pg_stats | [.[].state]'
        if test $(ceph --format json pg dump pgs |
                  jq '.pg_stats | [.[].state]'| grep recovering | wc -l) -ge 2
        then
            break
        fi
        sleep 2
        if [ "$count" -eq 10 ]
        then
            echo "Not enough recovery started simultaneously"
            return 1
        fi
        count=$((count + 1))
    done
    ceph pg dump pgs

    # Scrub every PG in parallel; pg_scrub_mod exits 2 when its PG was
    # recovering, which wait_background_check tallies in recov_scrub_count.
    pids=""
    recov_scrub_count=0
    local pg
    for pg in $(seq 0 $((PGS - 1)))
    do
        run_in_background pids pg_scrub_mod $poolid.$(printf "%x" $pg)
    done
    wait_background_check pids
    return_code=$?
    if [ $return_code -ne 0 ]; then return $return_code; fi

    ERRORS=0
    if test $recov_scrub_count -eq 0
    then
        echo "No scrubs occurred while PG recovering"
        ERRORS=$((ERRORS + 1))
    fi

    # NOTE(review): $name_prefix is never assigned in this file, so the
    # grep pattern degenerates to '[^/]*\.pid' and matches any daemon
    # pidfile under $dir — verify against ceph-helpers.sh whether a
    # prefix was intended here.
    pidfile=$(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid')
    pid=$(cat $pidfile)
    if ! kill -0 $pid
    then
        echo "OSD crash occurred"
        #tail -100 $dir/osd.0.log
        ERRORS=$((ERRORS + 1))
    fi

    # Work around for http://tracker.ceph.com/issues/38195
    kill_daemons $dir #|| return 1

    declare -a err_strings
    err_strings[0]="not scheduling scrubs due to active recovery"

    # Echo any matches for debugging before the real checks below.
    for osd in $(seq 0 $((OSDS - 1)))
    do
        grep "not scheduling scrubs" $dir/osd.${osd}.log
    done
    local err_string
    for err_string in "${err_strings[@]}"
    do
        found=false
        for osd in $(seq 0 $((OSDS - 1)))
        do
            if grep "$err_string" $dir/osd.${osd}.log > /dev/null;
            then
                found=true
            fi
        done
        # With scrub-during-recovery enabled the message must NOT appear.
        if [ "$found" = "true" ]; then
            echo "Found log message not expected '$err_string'"
            ERRORS=$((ERRORS + 1))
        fi
    done

    teardown $dir || return 1

    if [ $ERRORS != "0" ];
    then
        echo "TEST FAILED WITH $ERRORS ERRORS"
        return 1
    fi

    echo "TEST PASSED"
    return 0
}
# Dispatch through the ceph-helpers main(): sets up tracing/timeouts and
# invokes run() above with the scratch directory.
main osd-recovery-scrub "$@"

# Local Variables:
# compile-command: "cd build ; make -j4 && \
#    ../qa/run-standalone.sh osd-recovery-scrub.sh"
# End: