# -*- mode:text; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
# vim: ts=8 sw=2 smarttab
#
# test the handling of a corrupted SnapMapper DB by Scrub

source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
source $CEPH_ROOT/qa/standalone/scrub/scrub-helpers.sh
function run() {
    local dir=$1
    shift

    export CEPH_MON="127.0.0.1:7144" # git grep '\<7144\>' : there must be only one
    export CEPH_ARGS
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
    CEPH_ARGS+="--mon-host=$CEPH_MON "

    export -n CEPH_CLI_TEST_DUP_COMMAND
    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
    for func in $funcs ; do
        setup $dir || return 1
        $func $dir || return 1
        teardown $dir || return 1
    done
}
# one clone & multiple snaps (according to the number of parameters)
function make_a_clone()
{
    #turn off '-x' (but remember previous state)
    local saved_echo_flag=${-//[^x]/}
    set +x
    local pool=$1
    local obj=$2
    shift 2
    echo $RANDOM | rados -p $pool put $obj - || return 1
    for snap in $@ ; do
        rados -p $pool mksnap $snap || return 1
    done
    if [[ -n "$saved_echo_flag" ]]; then set -x; fi
}
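# e.g. (as used below): make_a_clone $poolname $objname snap01 snap02
#      writes a new version of the object, then creates the pool snapshots snap01 & snap02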
function TEST_truncated_sna_record() {
    local dir=$1
    # cluster-creation parameters consumed by standard_scrub_cluster
    # (representative values; the helper writes pool_id & osd_args back into the array)
    local -A cluster_conf=(
        ['osds_num']="3"
        ['pgs_in_pool']="4"
        ['pool_name']="test"
    )
    local extr_dbg=1 # extra-debug level controlling the optional output below
    (( extr_dbg > 1 )) && echo "Dir: $dir"
    standard_scrub_cluster $dir cluster_conf
    ceph tell osd.* config set osd_stats_update_period_not_scrubbing "1"
    ceph tell osd.* config set osd_stats_update_period_scrubbing "1"

    local osdn=${cluster_conf['osds_num']}
    local poolid=${cluster_conf['pool_id']}
    local poolname=${cluster_conf['pool_name']}
    local objname="objxxx"
    # create an object and clone it
    make_a_clone $poolname $objname snap01 snap02 || return 1
    make_a_clone $poolname $objname snap13 || return 1
    make_a_clone $poolname $objname snap24 snap25 || return 1
    echo $RANDOM | rados -p $poolname put $objname - || return 1
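    # the object now has a head plus three clones (preserved for snap01/02,
    # snap13, and snap24/25); the clone kept for snap13 is the one whose
    # SnapMapper record is corrupted further below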
    #identify the PG and the primary OSD
    local pgid=`ceph --format=json-pretty osd map $poolname $objname | jq -r '.pgid'`
    local osd=`ceph --format=json-pretty osd map $poolname $objname | jq -r '.up[0]'`
    echo "pgid is $pgid (primary: osd.$osd)"
    # turn on the publishing of test data in the 'scrubber' section of 'pg query' output
    set_query_debug $pgid
    # verify the existence of these clones
    (( extr_dbg >= 1 )) && rados --format json-pretty -p $poolname listsnaps $objname

    ceph pg $pgid deep_scrub || return 1
    # we aren't just waiting for the scrub to terminate, but also for the
    # logs to be published
    until grep -a -q -- "event: --^^^^---- ScrubFinished" $dir/osd.$osd.log ; do
        sleep 0.2
    done
    ceph osd set noscrub || return 1
    ceph osd set nodeep-scrub || return 1
    grep -a -q -v "ERR" $dir/osd.$osd.log || return 1
    kill_daemons $dir TERM osd || return 1
    (( extr_dbg >= 2 )) && ceph-kvstore-tool bluestore-kv $dir/0 dump "p"
    (( extr_dbg >= 2 )) && ceph-kvstore-tool bluestore-kv $dir/2 dump "p" | grep -a SNA_
    (( extr_dbg >= 2 )) && grep -a SNA_ /tmp/oo2.dump
    (( extr_dbg >= 2 )) && ceph-kvstore-tool bluestore-kv $dir/2 dump p 2> /dev/null
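    # count the SNA_ (snap-mapping) entries on the primary before corrupting
    # anything; the same count is expected once the damaged record has been
    # repaired (verified at the end of the test)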
    local num_sna_b4=`ceph-kvstore-tool bluestore-kv $dir/$osd dump p 2> /dev/null | grep -a -e 'SNA_[0-9]_000000000000000[0-9]_000000000000000' \
        | awk -e '{print $2;}' | wc -l`
    for sdn in $(seq 0 $(expr $osdn - 1))
    do
        kvdir=$dir/$sdn
        echo "corrupting the SnapMapper DB of osd.$sdn (db: $kvdir)"
        (( extr_dbg >= 3 )) && ceph-kvstore-tool bluestore-kv $kvdir dump "p"

        # truncate the 'mapping' (SNA_) entry corresponding to the snap13 clone
        KY=`ceph-kvstore-tool bluestore-kv $kvdir dump p 2> /dev/null | grep -a -e 'SNA_[0-9]_0000000000000003_000000000000000' \
            | awk -e '{print $2;}'`
        (( extr_dbg >= 1 )) && echo "SNA key: $KY" | cat -v

        # save the value of that entry, so that it can be re-attached to the
        # truncated key
        tmp_fn1=`mktemp -p /tmp --suffix="_the_val"`
        (( extr_dbg >= 1 )) && echo "Value dumped in: $tmp_fn1"
        ceph-kvstore-tool bluestore-kv $kvdir get p "$KY" out $tmp_fn1 2> /dev/null
        (( extr_dbg >= 2 )) && od -xc $tmp_fn1

        # replace the entry with one keyed by a truncated version of the
        # original key (the value itself is kept unchanged)
        NKY=${KY:0:-30}
        ceph-kvstore-tool bluestore-kv $kvdir rm "p" "$KY" 2> /dev/null
        ceph-kvstore-tool bluestore-kv $kvdir set "p" "$NKY" in $tmp_fn1 2> /dev/null

        (( extr_dbg >= 1 )) || rm $tmp_fn1
    done
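    # restart all OSDs (passing the same arguments used when the cluster was
    # created by standard_scrub_cluster), and wait for them to respond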
    orig_osd_args=" ${cluster_conf['osd_args']}"
    orig_osd_args=" $(echo $orig_osd_args)" # collapse repeated whitespace
    (( extr_dbg >= 2 )) && echo "Copied OSD args: /$orig_osd_args/ /${orig_osd_args:1}/"
    for sdn in $(seq 0 $(expr $osdn - 1))
    do
        CEPH_ARGS="$CEPH_ARGS $orig_osd_args" activate_osd $dir $sdn
    done
    for sdn in $(seq 0 $(expr $osdn - 1))
    do
        timeout 60 ceph tell osd.$sdn version
    done
    rados --format json-pretty -p $poolname listsnaps $objname
    # when scrubbing now, we expect the scrub to emit a cluster-log ERR message
    # regarding a SnapMapper internal inconsistency
    ceph osd unset nodeep-scrub || return 1
    ceph osd unset noscrub || return 1
    # what is the primary now?
    local cur_prim=`ceph --format=json-pretty osd map $poolname $objname | jq -r '.up[0]'`

    ceph pg $pgid deep_scrub || return 1
    sleep 5 # allow the scrub to complete and its log messages to be flushed
    (( extr_dbg >= 1 )) && grep -a "ERR" $dir/osd.$cur_prim.log
    grep -a -q "ERR" $dir/osd.$cur_prim.log || return 1
    # but did we fix the snap issue? let's try scrubbing again

    local prev_err_cnt=`grep -a "ERR" $dir/osd.$cur_prim.log | wc -l`
    echo "prev count: $prev_err_cnt"

    # scrub again. No errors are expected this time
    ceph pg $pgid deep_scrub || return 1
    sleep 5 # again, wait for the scrub to complete
    (( extr_dbg >= 1 )) && grep -a "ERR" $dir/osd.$cur_prim.log
    local current_err_cnt=`grep -a "ERR" $dir/osd.$cur_prim.log | wc -l`
    (( extr_dbg >= 1 )) && echo "current count: $current_err_cnt"
    (( current_err_cnt == prev_err_cnt )) || return 1
    kill_daemons $dir TERM osd || return 1
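    # with the OSDs down again, verify directly (on one of the stores) that the
    # full set of SNA_ mapping entries has been restored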
    (( extr_dbg >= 2 )) && ceph-kvstore-tool bluestore-kv $kvdir dump p 2> /dev/null | grep -a -e 'SNA_[0-9]_' \
        | awk -e '{print $2;}'
    local num_sna_full=`ceph-kvstore-tool bluestore-kv $kvdir dump p 2> /dev/null | grep -a -e 'SNA_[0-9]_000000000000000[0-9]_000000000000000' \
        | awk -e '{print $2;}' | wc -l`
    (( num_sna_full == num_sna_b4 )) || return 1
}