]> git.proxmox.com Git - ceph.git/blame - ceph/qa/standalone/scrub/osd-recovery-scrub.sh
bump version to 18.2.2-pve1
[ceph.git] / ceph / qa / standalone / scrub / osd-recovery-scrub.sh
CommitLineData
11fdf7f2 1#! /usr/bin/env bash
b5b8bbf5
FG
2#
3# Copyright (C) 2017 Red Hat <contact@redhat.com>
4#
5# Author: David Zafman <dzafman@redhat.com>
6#
7# This program is free software; you can redistribute it and/or modify
8# it under the terms of the GNU Library Public License as published by
9# the Free Software Foundation; either version 2, or (at your option)
10# any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU Library Public License for more details.
16#
17source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
18
function run() {
    # Entry point invoked by main() from ceph-helpers.sh: set up the common
    # CEPH_* environment, then execute the requested tests.
    #
    # @param dir  scratch directory handed to every test function
    # @param ...  optional explicit list of test functions
    #             (default: every TEST_* function defined in this file)
    # @return 0 if all tests pass, 1 on the first failure
    local dir=$1
    shift

    export CEPH_MON="127.0.0.1:7124" # git grep '\<7124\>' : there must be only one
    export CEPH_ARGS
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
    CEPH_ARGS+="--mon-host=$CEPH_MON "

    # make sure the CLI helper does not issue each command twice
    export -n CEPH_CLI_TEST_DUP_COMMAND
    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
    local func
    for func in $funcs ; do
        "$func" "$dir" || return 1
    done
}
34
f67539c2
TL
# Simple test for "not scheduling scrubs due to active recovery"
# OSD::sched_scrub() called on all OSDs during ticks
function TEST_recovery_scrub_1() {
    # With osd_scrub_during_recovery=false, every OSD must log that it is
    # skipping scrub scheduling while recovery is active.
    #
    # @param dir scratch directory for the test cluster
    # @return 0 on success, 1 on error
    local dir=$1
    local poolname=test

    TESTDATA="testdata.$$"
    OSDS=4
    PGS=1
    OBJECTS=100
    ERRORS=0

    setup $dir || return 1
    run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true \
            --osd_scrub_interval_randomize_ratio=0.0 || return 1
    run_mgr $dir x || return 1
    # scrub-during-recovery disabled: the OSDs should refuse to scrub
    for osd in $(seq 0 $((OSDS - 1)))
    do
        run_osd $dir $osd --osd_scrub_during_recovery=false || return 1
    done

    # Create a pool with $PGS pgs
    create_pool $poolname $PGS $PGS
    wait_for_clean || return 1
    poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')

    ceph pg dump pgs

    dd if=/dev/urandom of=$TESTDATA bs=1M count=50
    for i in $(seq 1 $OBJECTS)
    do
        rados -p $poolname put obj${i} $TESTDATA
    done
    rm -f $TESTDATA

    # growing the pool from size 1 to 4 forces recovery of every object
    ceph osd pool set $poolname size 4

    # Wait for recovery to start
    set -o pipefail
    count=0
    while true
    do
        if ceph --format json pg dump pgs |
            jq '.pg_stats | [.[] | .state | contains("recovering")]' | grep -q true
        then
            break
        fi
        sleep 2
        if test "$count" -eq "10"
        then
            echo "Recovery never started"
            return 1
        fi
        count=$((count + 1))
    done
    set +o pipefail
    ceph pg dump pgs

    sleep 10
    # Work around for http://tracker.ceph.com/issues/38195
    kill_daemons $dir #|| return 1

    declare -a err_strings
    err_strings[0]="not scheduling scrubs due to active recovery"

    # show the relevant log lines for debugging
    for osd in $(seq 0 $((OSDS - 1)))
    do
        grep "not scheduling scrubs" $dir/osd.${osd}.log
    done
    # every OSD must have logged the message
    for err_string in "${err_strings[@]}"
    do
        found=false
        count=0
        for osd in $(seq 0 $((OSDS - 1)))
        do
            if grep -q "$err_string" $dir/osd.${osd}.log
            then
                found=true
                count=$((count + 1))
            fi
        done
        if [ "$found" = "false" ]; then
            echo "Missing log message '$err_string'"
            ERRORS=$((ERRORS + 1))
        fi
        [ "$count" -eq "$OSDS" ] || return 1
    done

    teardown $dir || return 1

    if [ $ERRORS != "0" ];
    then
        echo "TEST FAILED WITH $ERRORS ERRORS"
        return 1
    fi

    echo "TEST PASSED"
    return 0
}
134
##
# a modified version of wait_for_scrub(), which terminates if the Primary
# of the to-be-scrubbed PG changes
#
# Given the *last_scrub*, wait for scrub to happen on **pgid**. It
# will fail if scrub does not complete within $TIMEOUT seconds. The
# repair is complete whenever the **get_last_scrub_stamp** function
# reports a timestamp different from the one given in argument.
#
# @param pgid the id of the PG
# @param orig_primary the primary OSD when started
# @param last_scrub timestamp of the last scrub for *pgid*
# @param sname (optional) scrub stamp field to poll, default last_scrub_stamp
# @return 0 on success, 1 on error
#
function wait_for_scrub_mod() {
    local pgid=$1
    local orig_primary=$2
    local last_scrub="$3"
    local sname=${4:-last_scrub_stamp}
    local i current_primary

    for ((i=0; i < $TIMEOUT; i++)); do
        sleep 0.2
        if test "$(get_last_scrub_stamp $pgid $sname)" '>' "$last_scrub" ; then
            return 0
        fi
        sleep 1
        # are we still the primary? if not, stop waiting (successfully):
        # the old primary will never update the scrub stamp.
        # (was `bin/ceph`, which only worked when CWD is the build dir)
        current_primary=$(ceph pg $pgid query | jq '.acting[0]')
        if [ "$orig_primary" != "$current_primary" ]; then
            echo $orig_primary no longer primary for $pgid
            return 0
        fi
    done
    return 1
}
170
##
# A modified version of pg_scrub()
#
# Run scrub on **pgid** and wait until it completes. The pg_scrub
# function will fail if repair does not complete within $TIMEOUT
# seconds. The pg_scrub is complete whenever the
# **get_last_scrub_stamp** function reports a timestamp different from
# the one stored before starting the scrub, or whenever the Primary
# changes.
#
# @param pgid the id of the PG
# @return 0 on success, 1 on error,
#         2 if the PG was recovering when the scrub was requested
#
function pg_scrub_mod() {
    local pgid=$1
    local last_scrub
    last_scrub=$(get_last_scrub_stamp $pgid)
    # locate the primary
    # (was `bin/ceph`, which only worked when CWD is the build dir)
    local my_primary
    my_primary=$(ceph pg $pgid query | jq '.acting[0]')
    local recovery=false
    ceph pg scrub $pgid
    # remember whether the PG was recovering when the scrub was requested
    if ceph --format json pg dump pgs | jq ".pg_stats | .[] | select(.pgid == \"$pgid\") | .state" | grep -q recovering
    then
        recovery=true
    fi
    wait_for_scrub_mod $pgid $my_primary "$last_scrub" || return 1
    if test "$recovery" = "true"
    then
        # exit code 2 tells wait_background_check to bump recov_scrub_count
        return 2
    fi
}
202
20effc67
TL
# Same as wait_background() except that it checks for exit code 2 and bumps recov_scrub_count
#
# @param 1 NAME of the variable holding the PIDs to wait for; it is
#          cleared on return to signal that all processes ended
# Globals: increments recov_scrub_count for every job that exits 2
# @return 0 if every job exited 0 or 2, 1 otherwise
function wait_background_check() {
    # We extract the PIDS from the variable name.
    # NB: deliberately NOT named "pids" and not shadowing $1 locally —
    # the eval below must assign to the CALLER's variable.
    local bg_pids=${!1}
    local pid retcode
    local return_code=0

    for pid in $bg_pids; do
        wait "$pid"
        retcode=$?
        if test "$retcode" -eq 2
        then
            # exit code 2: a scrub completed while the PG was recovering
            recov_scrub_count=$((recov_scrub_count + 1))
        elif test "$retcode" -ne 0
        then
            # If one process failed then return 1
            return_code=1
        fi
    done

    # We empty the variable reporting that all process ended
    eval "$1=''"

    return $return_code
}
227
# osd_scrub_during_recovery=true make sure scrub happens
function TEST_recovery_scrub_2() {
    # With osd_scrub_during_recovery=true, scrubs must be allowed to run
    # while PGs are recovering, and the "not scheduling scrubs" message
    # must NOT appear in any OSD log.
    #
    # @param dir scratch directory for the test cluster
    # @return 0 on success, non-zero on error
    local dir=$1
    local poolname=test

    TESTDATA="testdata.$$"
    OSDS=8
    PGS=32
    OBJECTS=40

    setup $dir || return 1
    run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true \
            --osd_scrub_interval_randomize_ratio=0.0 || return 1
    run_mgr $dir x || return 1
    # slow recovery down so the scrubs below can overlap with it
    for osd in $(seq 0 $((OSDS - 1)))
    do
        run_osd $dir $osd --osd_scrub_during_recovery=true --osd_recovery_sleep=10 || return 1
    done

    # Create a pool with $PGS pgs
    create_pool $poolname $PGS $PGS
    wait_for_clean || return 1
    poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')

    dd if=/dev/urandom of=$TESTDATA bs=1M count=50
    for i in $(seq 1 $OBJECTS)
    do
        rados -p $poolname put obj${i} $TESTDATA
    done
    rm -f $TESTDATA

    # growing the pool from size 1 to 3 forces recovery of every object
    ceph osd pool set $poolname size 3

    ceph pg dump pgs

    # Wait for at least 2 PGs to be recovering simultaneously
    count=0
    while true
    do
        if test $(ceph --format json pg dump pgs |
            jq '.pg_stats | [.[].state]'| grep recovering | wc -l) -ge 2
        then
            break
        fi
        sleep 2
        if test "$count" -eq "10"
        then
            echo "Not enough recovery started simultaneously"
            return 1
        fi
        count=$((count + 1))
    done
    ceph pg dump pgs

    # scrub every PG in the background; pg_scrub_mod returns 2 when its
    # scrub overlapped recovery, which wait_background_check tallies
    # into recov_scrub_count
    pids=""
    recov_scrub_count=0
    for pg in $(seq 0 $((PGS - 1)))
    do
        run_in_background pids pg_scrub_mod $poolid.$(printf "%x" $pg)
    done
    wait_background_check pids
    return_code=$?
    if [ $return_code -ne 0 ]; then return $return_code; fi

    ERRORS=0
    if test $recov_scrub_count -eq 0
    then
        echo "No scrubs occurred while PG recovering"
        ERRORS=$((ERRORS + 1))
    fi

    # NOTE(review): $name_prefix is never set in this function, so the grep
    # pattern degrades to '[^/]*\.pid' and matches every daemon pidfile;
    # kill -0 then checks all of those pids at once — confirm this
    # "no daemon crashed" check is intended.
    pidfile=$(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid')
    pid=$(cat $pidfile)
    if ! kill -0 $pid
    then
        echo "OSD crash occurred"
        #tail -100 $dir/osd.0.log
        ERRORS=$((ERRORS + 1))
    fi

    # Work around for http://tracker.ceph.com/issues/38195
    kill_daemons $dir #|| return 1

    declare -a err_strings
    err_strings[0]="not scheduling scrubs due to active recovery"

    # show the relevant log lines for debugging
    for osd in $(seq 0 $((OSDS - 1)))
    do
        grep "not scheduling scrubs" $dir/osd.${osd}.log
    done
    # with scrub-during-recovery enabled the message must NOT appear
    for err_string in "${err_strings[@]}"
    do
        found=false
        for osd in $(seq 0 $((OSDS - 1)))
        do
            if grep "$err_string" $dir/osd.${osd}.log > /dev/null;
            then
                found=true
            fi
        done
        if [ "$found" = "true" ]; then
            echo "Found log message not expected '$err_string'"
            ERRORS=$((ERRORS + 1))
        fi
    done

    teardown $dir || return 1

    if [ $ERRORS != "0" ];
    then
        echo "TEST FAILED WITH $ERRORS ERRORS"
        return 1
    fi

    echo "TEST PASSED"
    return 0
}
346
347main osd-recovery-scrub "$@"
348
349# Local Variables:
350# compile-command: "cd build ; make -j4 && \
351# ../qa/run-standalone.sh osd-recovery-scrub.sh"
28e407b8 352# End: