#!/usr/bin/env bash
#
# Copyright (C) 2018 Red Hat <contact@redhat.com>
#
# Author: David Zafman <dzafman@redhat.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Library Public License for more details.
#
source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
source $CEPH_ROOT/qa/standalone/scrub/scrub-helpers.sh
function run() {
    local dir=$1
    shift

    export CEPH_MON="127.0.0.1:7138" # git grep '\<7138\>' : there must be only one
    export CEPH_ARGS
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
    CEPH_ARGS+="--mon-host=$CEPH_MON "

    export -n CEPH_CLI_TEST_DUP_COMMAND
    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
    for func in $funcs ; do
        setup $dir || return 1
        $func $dir || return 1
        teardown $dir || return 1
    done
}

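# Corrupt one replica of an object, then verify that a deep scrub detects the
# inconsistency, that the scrub-error counters follow the acting set as the
# primary is marked out/in, and that 'repair' clears the +inconsistent state.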
function TEST_scrub_test() {
    local dir=$1
    local poolname=test
    local OSDS=3
    local objects=15

    TESTDATA="testdata.$$"

    run_mon $dir a --osd_pool_default_size=3 || return 1
    run_mgr $dir x || return 1
    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done
    # Create a pool with a single pg
    create_pool $poolname 1 1
    wait_for_clean || return 1
    poolid=$(ceph osd dump | grep "^pool.*[']${poolname}[']" | awk '{ print $2 }')

    dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
    for i in `seq 1 $objects`
    do
        rados -p $poolname put obj${i} $TESTDATA
    done
    rm -f $TESTDATA
    local primary=$(get_primary $poolname obj1)
    local otherosd=$(get_not_primary $poolname obj1)
    if [ "$otherosd" = "2" ];
    then
        local anotherosd="0"
    else
        local anotherosd="2"
    fi

    # Corrupt obj1 on one of the replicas
    objectstore_tool $dir $anotherosd obj1 set-bytes /etc/fstab
    local pgid="${poolid}.0"
    pg_deep_scrub "$pgid" || return 1

    ceph pg dump pgs | grep ^${pgid} | grep -q -- +inconsistent || return 1
    test "$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_scrub_errors')" = "2" || return 1
    ceph osd out $primary
    wait_for_clean || return 1

    pg_deep_scrub "$pgid" || return 1

    test "$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_scrub_errors')" = "2" || return 1
    test "$(ceph pg $pgid query | jq '.peer_info[0].stats.stat_sum.num_scrub_errors')" = "2" || return 1
    ceph pg dump pgs | grep ^${pgid} | grep -q -- +inconsistent || return 1

    ceph osd in $primary
    wait_for_clean || return 1
    repair "$pgid" || return 1
    wait_for_clean || return 1

    # After the repair, the previous primary still carries the old scrub-error count
    test "$(ceph pg $pgid query | jq '.peer_info[0].stats.stat_sum.num_scrub_errors')" = "2" || return 1
    ceph pg dump pgs | grep ^${pgid} | grep -vq -- +inconsistent || return 1

    ceph osd out $primary
    wait_for_clean || return 1

    test "$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_scrub_errors')" = "0" || return 1
    test "$(ceph pg $pgid query | jq '.peer_info[0].stats.stat_sum.num_scrub_errors')" = "0" || return 1
    test "$(ceph pg $pgid query | jq '.peer_info[1].stats.stat_sum.num_scrub_errors')" = "0" || return 1
    ceph pg dump pgs | grep ^${pgid} | grep -vq -- +inconsistent || return 1
}

# Grab year-month-day
DATESED="s/\([0-9]*-[0-9]*-[0-9]*\).*/\1/"
DATEFORMAT="%Y-%m-%d"
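# Example: echo "2021-10-12T20:32:43.645168+0000" | sed $DATESED prints
# "2021-10-12", the same form produced by date +$DATEFORMAT, so timestamps
# can be compared at day granularity below.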
function check_dump_scrubs() {
    local primary=$1
    local sched_time_check="$2"
    local deadline_check="$3"

    DS="$(CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${primary}) dump_scrubs)"
    # use eval to drop double-quotes
    eval SCHED_TIME=$(echo $DS | jq '.[0].sched_time')
    test $(echo $SCHED_TIME | sed $DATESED) = $(date +${DATEFORMAT} -d "now + $sched_time_check") || return 1
    # use eval to drop double-quotes
    eval DEADLINE=$(echo $DS | jq '.[0].deadline')
    test $(echo $DEADLINE | sed $DATESED) = $(date +${DATEFORMAT} -d "now + $deadline_check") || return 1
}

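# Verify that changes to the scrub intervals - global (via the admin socket)
# and per-pool (via 'ceph osd pool set') - show up in the primary's scheduled
# scrub time and deadline.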
function TEST_interval_changes() {
    local poolname=test
    local OSDS=2
    local objects=10
    local dir=$1

    # Don't assume how internal defaults are set
    local day="$(expr 24 \* 60 \* 60)"
    local week="$(expr $day \* 7)"
    local min_interval=$day
    local max_interval=$week
    local WAIT_FOR_UPDATE=15

    TESTDATA="testdata.$$"

    # This min scrub interval results in 30 seconds backoff time
    run_mon $dir a --osd_pool_default_size=$OSDS || return 1
    run_mgr $dir x || return 1
    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd --osd_scrub_min_interval=$min_interval --osd_scrub_max_interval=$max_interval --osd_scrub_interval_randomize_ratio=0 || return 1
    done
    # Create a pool with a single pg
    create_pool $poolname 1 1
    wait_for_clean || return 1
    local poolid=$(ceph osd dump | grep "^pool.*[']${poolname}[']" | awk '{ print $2 }')

    dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
    for i in `seq 1 $objects`
    do
        rados -p $poolname put obj${i} $TESTDATA
    done
    rm -f $TESTDATA

    local primary=$(get_primary $poolname obj1)

    # Check initial settings from above (min 1 day, max 1 week)
    check_dump_scrubs $primary "1 day" "1 week" || return 1
    # Change global osd_scrub_min_interval to 2 days
    CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${primary}) config set osd_scrub_min_interval $(expr $day \* 2)
    sleep $WAIT_FOR_UPDATE
    check_dump_scrubs $primary "2 days" "1 week" || return 1

    # Change global osd_scrub_max_interval to 2 weeks
    CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${primary}) config set osd_scrub_max_interval $(expr $week \* 2)
    sleep $WAIT_FOR_UPDATE
    check_dump_scrubs $primary "2 days" "2 weeks" || return 1

    # Change pool scrub_min_interval to 3 days
    ceph osd pool set $poolname scrub_min_interval $(expr $day \* 3)
    sleep $WAIT_FOR_UPDATE
    check_dump_scrubs $primary "3 days" "2 weeks" || return 1

    # Change pool scrub_max_interval to 3 weeks
    ceph osd pool set $poolname scrub_max_interval $(expr $week \* 3)
    sleep $WAIT_FOR_UPDATE
    check_dump_scrubs $primary "3 days" "3 weeks" || return 1
}

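# Restrict scrubbing to a week-day window that opens two days from now, so a
# requested scrub must first go into extended sleep; then reopen the window and
# check that the scrub finishes neither too early nor too late.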
function TEST_scrub_extended_sleep() {
    local dir=$1
    local poolname=test
    local OSDS=3

    TESTDATA="testdata.$$"

    DAY=$(date +%w)
    # Keep the window from wrapping past the end of the week
    if [ "$DAY" -ge "4" ];
    then
        DAY="0"
    fi
    # Start after 2 days in case we are near midnight
    DAY_START=$(expr $DAY + 2)
    DAY_END=$(expr $DAY + 3)

    run_mon $dir a --osd_pool_default_size=3 || return 1
    run_mgr $dir x || return 1
    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd --osd_scrub_sleep=0 \
                          --osd_scrub_extended_sleep=20 \
                          --bluestore_cache_autotune=false \
                          --osd_deep_scrub_randomize_ratio=0.0 \
                          --osd_scrub_interval_randomize_ratio=0 \
                          --osd_scrub_begin_week_day=$DAY_START \
                          --osd_scrub_end_week_day=$DAY_END \
                          || return 1
    done

    # Create a pool with a single pg
    create_pool $poolname 1 1
    wait_for_clean || return 1
    # Trigger a scrub on a PG
    local pgid=$(get_pg $poolname SOMETHING)
    local primary=$(get_primary $poolname SOMETHING)
    local last_scrub=$(get_last_scrub_stamp $pgid)
    ceph tell $pgid scrub || return 1

    # Allow scrub to start extended sleep
    PASSED="false"
    for ((i=0; i < 15; i++)); do
        if grep -q "scrub state.*, sleeping" $dir/osd.${primary}.log
        then
            PASSED="true"
            break
        fi
        sleep 1
    done

    # Check that extended sleep was triggered
    if [ $PASSED = "false" ];
    then
        return 1
    fi

    # release scrub to run after extended sleep finishes
    ceph tell osd.$primary config set osd_scrub_begin_week_day 0
    ceph tell osd.$primary config set osd_scrub_end_week_day 0

    # Due to extended sleep, the scrub should not be done within 20 seconds
    # but test up to 10 seconds and make sure it happens by 25 seconds.
    count=0
    PASSED="false"
    for ((i=0; i < 25; i++)); do
        count=$(expr $count + 1)
        if test "$(get_last_scrub_stamp $pgid)" '>' "$last_scrub" ; then
            # Did scrub run too soon?
            if [ $count -lt "10" ];
            then
                return 1
            fi
            PASSED="true"
            break
        fi
        sleep 1
    done

    # Make sure scrub eventually ran
    if [ $PASSED = "false" ];
    then
        return 1
    fi
}

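# Shared body for TEST_scrub_abort and TEST_deep_scrub_abort: start a
# (deep-)scrub, raise the matching no(deep-)scrub flag while the scrub is in
# progress, verify that the OSD logs the abort, then clear the flags and
# confirm the scrub still completes.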
function _scrub_abort() {
    local dir=$1
    local poolname=test
    local OSDS=3
    local objects=1000
    local type=$2

    TESTDATA="testdata.$$"
    if test $type = "scrub";
    then
        stopscrub="noscrub"
        check="noscrub"
    else
        stopscrub="nodeep-scrub"
        check="nodeep_scrub"
    fi

    run_mon $dir a --osd_pool_default_size=3 || return 1
    run_mgr $dir x || return 1
    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        # Set scheduler to "wpq" until there's a reliable way to query scrub
        # states with "--osd-scrub-sleep" set to 0. The "mclock_scheduler"
        # overrides the scrub sleep to 0 and as a result the checks in the
        # test fail.
        run_osd $dir $osd --osd_pool_default_pg_autoscale_mode=off \
                          --osd_deep_scrub_randomize_ratio=0.0 \
                          --osd_scrub_sleep=5.0 \
                          --osd_scrub_interval_randomize_ratio=0 \
                          --osd_op_queue=wpq || return 1
    done

    # Create a pool with a single pg
    create_pool $poolname 1 1
    wait_for_clean || return 1
    poolid=$(ceph osd dump | grep "^pool.*[']${poolname}[']" | awk '{ print $2 }')
    dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
    for i in `seq 1 $objects`
    do
        rados -p $poolname put obj${i} $TESTDATA
    done
    rm -f $TESTDATA

    local primary=$(get_primary $poolname obj1)
    local pgid="${poolid}.0"

    ceph tell $pgid $type || return 1
    # deep-scrub won't start without scrub noticing
    if [ "$type" = "deep_scrub" ];
    then
        ceph tell $pgid scrub || return 1
    fi
    # Wait for scrubbing to start
    set -o pipefail
    found="no"
    for i in $(seq 0 200)
    do
        flush_pg_stats
        if ceph pg dump pgs | grep ^$pgid | grep -q "scrubbing"
        then
            found="yes"
            break
        fi
    done
    set +o pipefail

    if test $found = "no";
    then
        echo "Scrubbing never started"
        return 1
    fi

    ceph osd set $stopscrub
    if [ "$type" = "deep_scrub" ];
    then
        ceph osd set noscrub
    fi

    # Wait for scrubbing to end
    set -o pipefail
    for i in $(seq 0 200)
    do
        flush_pg_stats
        if ceph pg dump pgs | grep ^$pgid | grep -q "scrubbing"
        then
            continue
        fi
        break
    done
    set +o pipefail

    if ! grep "$check set, aborting" $dir/osd.${primary}.log
    then
        echo "Abort not seen in log"
        return 1
    fi
    local last_scrub=$(get_last_scrub_stamp $pgid)
    ceph config set osd "osd_scrub_sleep" "0.1"

    ceph osd unset $stopscrub
    if [ "$type" = "deep_scrub" ];
    then
        ceph osd unset noscrub
    fi

    TIMEOUT=$(($objects / 2))
    wait_for_scrub $pgid "$last_scrub" || return 1
}

function TEST_scrub_abort() {
    local dir=$1
    _scrub_abort $dir scrub
}

function TEST_deep_scrub_abort() {
    local dir=$1
    _scrub_abort $dir deep_scrub
}

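# Limit the allowed scrub hours to a window that closed an hour ago and verify
# that a requested periodic scrub does not run.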
function TEST_scrub_permit_time() {
    local dir=$1
    local poolname=test
    local OSDS=3

    TESTDATA="testdata.$$"

    run_mon $dir a --osd_pool_default_size=3 || return 1
    run_mgr $dir x || return 1
    local scrub_begin_hour=$(date -d '2 hour ago' +"%H" | sed 's/^0//')
    local scrub_end_hour=$(date -d '1 hour ago' +"%H" | sed 's/^0//')
    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd --bluestore_cache_autotune=false \
                          --osd_deep_scrub_randomize_ratio=0.0 \
                          --osd_scrub_interval_randomize_ratio=0 \
                          --osd_scrub_begin_hour=$scrub_begin_hour \
                          --osd_scrub_end_hour=$scrub_end_hour || return 1
    done

    # Create a pool with a single pg
    create_pool $poolname 1 1
    wait_for_clean || return 1
    # Trigger a scrub on a PG
    local pgid=$(get_pg $poolname SOMETHING)
    local primary=$(get_primary $poolname SOMETHING)
    local last_scrub=$(get_last_scrub_stamp $pgid)
    # If we don't specify an amount of time to subtract from
    # current time to set last_scrub_stamp, it sets the deadline
    # back by osd_max_interval, which would cause the time-permit checking
    # to be skipped. Set back 1 day, the default scrub_min_interval.
    ceph tell $pgid scrub $(( 24 * 60 * 60 )) || return 1

    # Scrub should not run
    for ((i=0; i < 30; i++)); do
        if test "$(get_last_scrub_stamp $pgid)" '>' "$last_scrub" ; then
            return 1
        fi
        sleep 1
    done
}

# A test to recreate the problem described in bug #52901 - setting 'noscrub'
# without explicitly preventing deep scrubs made the PG 'unscrubable'.

function TEST_just_deep_scrubs() {
    local dir=$1
    # standard_scrub_cluster needs enough PGs in the pool to cover the
    # "${poolid}.2" PG queried below
    local -A cluster_conf=(
        ['osds_num']="3"
        ['pgs_in_pool']="4"
        ['pool_name']="test"
    )
    standard_scrub_cluster $dir cluster_conf
    local poolid=${cluster_conf['pool_id']}
    local poolname=${cluster_conf['pool_name']}
    echo "Pool: $poolname : $poolid"
    TESTDATA="testdata.$$"
    local objects=15

    dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
    for i in `seq 1 $objects`
    do
        rados -p $poolname put obj${i} $TESTDATA
    done
    rm -f $TESTDATA

    # Set both 'noscrub' & 'nodeep-scrub', then request a deep-scrub.
    # We do not expect to see the scrub scheduled.

    ceph osd set noscrub || return 1
    ceph osd set nodeep-scrub || return 1
    sleep 6 # the 'noscrub' command takes a long time to reach the OSDs
    local now_is=`date -I"ns"`
    declare -A sched_data
    local pgid="${poolid}.2"

    # turn on the publishing of test data in the 'scrubber' section of 'pg query' output
    set_query_debug $pgid

    extract_published_sch $pgid $now_is $now_is sched_data
    local saved_last_stamp=${sched_data['query_last_stamp']}
    local dbg_counter_at_start=${sched_data['query_scrub_seq']}
    echo "test counter @ start: $dbg_counter_at_start"

    ceph pg $pgid deep_scrub

    sleep 5 # 5s is the 'pg dump' interval

    declare -A sc_data_2
    extract_published_sch $pgid $now_is $now_is sc_data_2
    echo "test counter @ should show no change: " ${sc_data_2['query_scrub_seq']}
    (( ${sc_data_2['dmp_last_duration']} == 0)) || return 1
    (( ${sc_data_2['query_scrub_seq']} == $dbg_counter_at_start)) || return 1

    # Unset 'nodeep-scrub'. Deep scrubbing should start now.
    ceph osd unset nodeep-scrub || return 1
    sleep 5
    declare -A expct_qry_duration=( ['query_last_duration']="0" ['query_last_duration_neg']="not0" )
    echo "test counter @ should be higher than before the unset: " ${sc_data_2['query_scrub_seq']}
    wait_any_cond $pgid 10 $saved_last_stamp expct_qry_duration "WaitingAfterScrub " sc_data_2 || return 1
}

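# Walk one PG through the scrub-scheduling states (not yet scheduled, blocked
# by 'noscrub', actively scrubbing) and verify that 'pg query' and 'pg dump'
# publish the expected scheduling information at each step.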
function TEST_dump_scrub_schedule() {
    local dir=$1
    local poolname=test
    local OSDS=3
    local objects=15

    TESTDATA="testdata.$$"

    run_mon $dir a --osd_pool_default_size=$OSDS || return 1
    run_mgr $dir x || return 1
    # Set scheduler to "wpq" until there's a reliable way to query scrub states
    # with "--osd-scrub-sleep" set to 0. The "mclock_scheduler" overrides the
    # scrub sleep to 0 and as a result the checks in the test fail.
    local ceph_osd_args="--osd_deep_scrub_randomize_ratio=0 \
            --osd_scrub_interval_randomize_ratio=0 \
            --osd_scrub_backoff_ratio=0.0 \
            --osd_op_queue=wpq \
            --osd_scrub_sleep=0.2"

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd $ceph_osd_args || return 1
    done
    # Create a pool with a single pg
    create_pool $poolname 1 1
    wait_for_clean || return 1
    poolid=$(ceph osd dump | grep "^pool.*[']${poolname}[']" | awk '{ print $2 }')

    dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
    for i in `seq 1 $objects`
    do
        rados -p $poolname put obj${i} $TESTDATA
    done
    rm -f $TESTDATA

    local pgid="${poolid}.0"
    local now_is=`date -I"ns"`
    # before the scrubbing starts

    # last scrub duration should be 0. The scheduling data should show
    # a time in the future:
    # e.g. 'periodic scrub scheduled @ 2021-10-12T20:32:43.645168+0000'

    declare -A expct_starting=( ['query_active']="false" ['query_is_future']="true" ['query_schedule']="scrub scheduled" )
    declare -A sched_data
    extract_published_sch $pgid $now_is "2019-10-12T20:32:43.645168+0000" sched_data
    schedule_against_expected sched_data expct_starting "initial"
    (( ${sched_data['dmp_last_duration']} == 0)) || return 1
    echo "last-scrub --- " ${sched_data['query_last_scrub']}

    #
    # step 1: scrub once (mainly to ensure there is no urgency to scrub)
    #

    saved_last_stamp=${sched_data['query_last_stamp']}
    ceph tell osd.* config set osd_scrub_sleep "0"
    ceph pg deep-scrub $pgid

    # wait for the 'last duration' entries to change. Note that the 'dump' one will need
    # up to 5 seconds to sync
    sleep 5
    sched_data=()
    declare -A expct_qry_duration=( ['query_last_duration']="0" ['query_last_duration_neg']="not0" )
    wait_any_cond $pgid 10 $saved_last_stamp expct_qry_duration "WaitingAfterScrub " sched_data || return 1
    # verify that 'pg dump' also shows the change in last_scrub_duration
    sched_data=()
    declare -A expct_dmp_duration=( ['dmp_last_duration']="0" ['dmp_last_duration_neg']="not0" )
    wait_any_cond $pgid 10 $saved_last_stamp expct_dmp_duration "WaitingAfterScrub_dmp " sched_data || return 1
    #
    # step 2: set 'noscrub' and request a "periodic scrub". Watch for the change in the
    #         'is the scrub scheduled for the future' value
    #

    ceph tell osd.* config set osd_scrub_chunk_max "3" || return 1
    ceph tell osd.* config set osd_scrub_sleep "1.0" || return 1
    ceph osd set noscrub || return 1

    saved_last_stamp=${sched_data['query_last_stamp']}
    ceph pg $pgid scrub
    sched_data=()

    declare -A expct_scrub_peri_sched=( ['query_is_future']="false" )
    wait_any_cond $pgid 10 $saved_last_stamp expct_scrub_peri_sched "waitingBeingScheduled" sched_data || return 1

    # note: the induced change in 'last_scrub_stamp' that we've caused above is, by itself,
    #       not a publish-stats trigger. Thus it might happen that the information in
    #       'pg dump' will not get updated here. Do not expect 'dmp_is_future' to follow
    #       'query_is_future' without a good reason.
    ## declare -A expct_scrub_peri_sched_dmp=( ['dmp_is_future']="false" )
    ## wait_any_cond $pgid 15 $saved_last_stamp expct_scrub_peri_sched_dmp "waitingBeingScheduled" sched_data || echo "must be fixed"
    #
    # step 3: allow scrubs. Watch for the conditions during the scrubbing
    #

    saved_last_stamp=${sched_data['query_last_stamp']}
    ceph osd unset noscrub

    declare -A cond_active=( ['query_active']="true" )
    sched_data=()
    wait_any_cond $pgid 10 $saved_last_stamp cond_active "WaitingActive " sched_data || return 1

    # check for pg-dump to show being active. But if we see 'query_active' being reset -
    # we've just missed the scrub that already terminated.
    declare -A cond_active_dmp=( ['dmp_state_has_scrubbing']="true" ['query_active']="false" )
    sched_data=()
    wait_any_cond $pgid 10 $saved_last_stamp cond_active_dmp "WaitingActive " sched_data || return 1
}

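# Scrub a PG and verify that 'pg query' reports objects_scrubbed equal to the
# number of objects written to the pool.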
function TEST_pg_dump_objects_scrubbed() {
    local dir=$1
    local poolname=test
    local OSDS=3
    local objects=15

    TESTDATA="testdata.$$"

    setup $dir || return 1
    run_mon $dir a --osd_pool_default_size=$OSDS || return 1
    run_mgr $dir x || return 1
    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done

    # Create a pool with a single pg
    create_pool $poolname 1 1
    wait_for_clean || return 1
    poolid=$(ceph osd dump | grep "^pool.*[']${poolname}[']" | awk '{ print $2 }')

    dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
    for i in `seq 1 $objects`
    do
        rados -p $poolname put obj${i} $TESTDATA
    done
    rm -f $TESTDATA

    local pgid="${poolid}.0"
    # Trigger a scrub on a PG
    pg_scrub $pgid || return 1
    test "$(ceph pg $pgid query | jq '.info.stats.objects_scrubbed')" '=' $objects || return 1

    teardown $dir || return 1
}

main osd-scrub-test "$@"

# Local Variables:
# compile-command: "cd build ; make -j4 && \
#    ../qa/run-standalone.sh osd-scrub-test.sh"
# End: