#!/usr/bin/env bash
# -*- mode:text; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
# vim: ts=8 sw=2 smarttab
#
# test the handling of a corrupted SnapMapper DB by Scrub

source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
source $CEPH_ROOT/qa/standalone/scrub/scrub-helpers.sh
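# ceph-helpers.sh provides the generic test harness used below (setup, teardown,
# kill_daemons, activate_osd, ...); scrub-helpers.sh provides the scrub-specific
# helpers (standard_scrub_cluster, set_query_debug).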

function run() {
  local dir=$1
  shift

  export CEPH_MON="127.0.0.1:7144" # git grep '\<7144\>' : there must be only one
  export CEPH_ARGS
  CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
  CEPH_ARGS+="--mon-host=$CEPH_MON "

  export -n CEPH_CLI_TEST_DUP_COMMAND
  local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
  for func in $funcs ; do
    setup $dir || return 1
    $func $dir || return 1
    teardown $dir || return 1
  done
}

# one clone & multiple snaps (according to the number of parameters)
function make_a_clone()
{
  # turn off '-x' (but remember previous state)
  local saved_echo_flag=${-//[^x]/}
  set +x
  local pool=$1
  local obj=$2
  echo $RANDOM | rados -p $pool put $obj - || return 1
  shift 2
  for snap in $@ ; do
    rados -p $pool mksnap $snap || return 1
  done
  if [[ -n "$saved_echo_flag" ]]; then set -x; fi
}
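# Note: make_a_clone writes a new head version of the object and then takes one
# pool snapshot per remaining argument. The clone itself only materializes on the
# next write to the object (by a later make_a_clone call or the final 'rados put'
# in the test), when RADOS preserves the snapshotted version as a clone.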

function TEST_truncated_sna_record() {
  local dir=$1
  local -A cluster_conf=(
    ['osds_num']="3"
    ['pgs_in_pool']="4"
    ['pool_name']="test"
  )

  local extr_dbg=3
  (( extr_dbg > 1 )) && echo "Dir: $dir"
  standard_scrub_cluster $dir cluster_conf
  ceph tell osd.* config set osd_stats_update_period_not_scrubbing "1"
  ceph tell osd.* config set osd_stats_update_period_scrubbing "1"

  local osdn=${cluster_conf['osds_num']}
  local poolid=${cluster_conf['pool_id']}
  local poolname=${cluster_conf['pool_name']}
  local objname="objxxx"

  # create an object and clone it
  make_a_clone $poolname $objname snap01 snap02 || return 1
  make_a_clone $poolname $objname snap13 || return 1
  make_a_clone $poolname $objname snap24 snap25 || return 1
  echo $RANDOM | rados -p $poolname put $objname - || return 1
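  # After the sequence above the object has a head version plus three clones:
  # one kept for snap01/snap02, one for snap13, and one for snap24/snap25
  # (each clone is created by the first overwrite that follows its snapshots).
  # The snap13 clone is the one whose SnapMapper entry gets truncated below.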

  # identify the PG and the primary OSD
  local pgid=`ceph --format=json-pretty osd map $poolname $objname | jq -r '.pgid'`
  local osd=`ceph --format=json-pretty osd map $poolname $objname | jq -r '.up[0]'`
  echo "pgid is $pgid (primary: osd.$osd)"
  # turn on the publishing of test data in the 'scrubber' section of 'pg query' output
  set_query_debug $pgid

  # verify the existence of these clones
  (( extr_dbg >= 1 )) && rados --format json-pretty -p $poolname listsnaps $objname

  # scrub the PG
  ceph pg $pgid deep_scrub || return 1

  # we aren't just waiting for the scrub to terminate, but also for the
  # logs to be published
  sleep 3
  ceph pg dump pgs
  until grep -a -q -- "event: --^^^^---- ScrubFinished" $dir/osd.$osd.log ; do
    sleep 0.2
  done

  ceph pg dump pgs
  ceph osd set noscrub || return 1
  ceph osd set nodeep-scrub || return 1
  sleep 5
  # this first scrub, performed while the SnapMapper data is intact, must not log any errors
  ! grep -a -q "ERR" $dir/osd.$osd.log || return 1

  # kill the OSDs
  kill_daemons $dir TERM osd || return 1

  (( extr_dbg >= 2 )) && ceph-kvstore-tool bluestore-kv $dir/0 dump "p"
  (( extr_dbg >= 2 )) && ceph-kvstore-tool bluestore-kv $dir/2 dump "p" | grep -a SNA_
  (( extr_dbg >= 2 )) && grep -a SNA_ /tmp/oo2.dump
  (( extr_dbg >= 2 )) && ceph-kvstore-tool bluestore-kv $dir/2 dump p 2> /dev/null
  local num_sna_b4=`ceph-kvstore-tool bluestore-kv $dir/$osd dump p 2> /dev/null | grep -a -e 'SNA_[0-9]_000000000000000[0-9]_000000000000000' \
        | awk -e '{print $2;}' | wc -l`
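  # num_sna_b4, computed above, is the number of well-formed SnapMapper mapping
  # keys on the primary before any corruption. These keys live under the 'p' (omap)
  # prefix and appear to follow the pattern 'SNA_<pool>_<snap-id as 16 hex digits>_<clone name>';
  # the count is compared against the post-repair count at the end of the test.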

  for sdn in $(seq 0 $(expr $osdn - 1))
  do
    kvdir=$dir/$sdn
    echo "corrupting the SnapMapper DB of osd.$sdn (db: $kvdir)"
    (( extr_dbg >= 3 )) && ceph-kvstore-tool bluestore-kv $kvdir dump "p"

    # truncate the 'mapping' (SNA_) entry corresponding to the snap13 clone
    KY=`ceph-kvstore-tool bluestore-kv $kvdir dump p 2> /dev/null | grep -a -e 'SNA_[0-9]_0000000000000003_000000000000000' \
          | awk -e '{print $2;}'`
    (( extr_dbg >= 1 )) && echo "SNA key: $KY" | cat -v

    tmp_fn1=`mktemp -p /tmp --suffix="_the_val"`
    (( extr_dbg >= 1 )) && echo "Value dumped in: $tmp_fn1"
    ceph-kvstore-tool bluestore-kv $kvdir get p "$KY" out $tmp_fn1 2> /dev/null
    (( extr_dbg >= 2 )) && od -xc $tmp_fn1

    NKY=${KY:0:-30}
    ceph-kvstore-tool bluestore-kv $kvdir rm "p" "$KY" 2> /dev/null
    ceph-kvstore-tool bluestore-kv $kvdir set "p" "$NKY" in $tmp_fn1 2> /dev/null
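    # The three steps above dump the original key's value to a temp file, remove
    # the key, and re-insert the same value under a key with its last 30 characters
    # cut off - leaving a truncated SNA_ record that no longer matches the snap13
    # clone, which is the corruption the subsequent scrub is expected to detect.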

    (( extr_dbg >= 1 )) || rm $tmp_fn1
  done

  orig_osd_args=" ${cluster_conf['osd_args']}"
  orig_osd_args=" $(echo $orig_osd_args)"
  (( extr_dbg >= 2 )) && echo "Copied OSD args: /$orig_osd_args/ /${orig_osd_args:1}/"
  for sdn in $(seq 0 $(expr $osdn - 1))
  do
    CEPH_ARGS="$CEPH_ARGS $orig_osd_args" activate_osd $dir $sdn
  done
  sleep 1

  for sdn in $(seq 0 $(expr $osdn - 1))
  do
    timeout 60 ceph tell osd.$sdn version
  done
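  # The two loops above restart the OSDs with the same extra arguments that
  # standard_scrub_cluster used, then wait (via 'ceph tell osd.N version', bounded
  # by a timeout) until each restarted OSD is up and answering commands again.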
  rados --format json-pretty -p $poolname listsnaps $objname

  # when scrubbing now, we expect the scrub to emit a cluster-log ERR message regarding the SnapMapper internal inconsistency
  ceph osd unset nodeep-scrub || return 1
  ceph osd unset noscrub || return 1

  # what is the primary now?
  local cur_prim=`ceph --format=json-pretty osd map $poolname $objname | jq -r '.up[0]'`
  ceph pg dump pgs
  sleep 2
  ceph pg $pgid deep_scrub || return 1
  sleep 5
  ceph pg dump pgs
  (( extr_dbg >= 1 )) && grep -a "ERR" $dir/osd.$cur_prim.log
  grep -a -q "ERR" $dir/osd.$cur_prim.log || return 1
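  # The deep scrub above ran against the truncated mapping entries; the grep only
  # succeeds if the current primary logged at least one ERR line (the cluster-log
  # error about the SnapMapper inconsistency), i.e. the corruption was detected.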

  # but did we fix the snap issue? let's try scrubbing again

  local prev_err_cnt=`grep -a "ERR" $dir/osd.$cur_prim.log | wc -l`
  echo "prev count: $prev_err_cnt"

  # scrub again. No errors expected this time
  ceph pg $pgid deep_scrub || return 1
  sleep 5
  ceph pg dump pgs
  (( extr_dbg >= 1 )) && grep -a "ERR" $dir/osd.$cur_prim.log
  local current_err_cnt=`grep -a "ERR" $dir/osd.$cur_prim.log | wc -l`
  (( extr_dbg >= 1 )) && echo "current count: $current_err_cnt"
  (( current_err_cnt == prev_err_cnt )) || return 1
  kill_daemons $dir TERM osd || return 1
  kvdir=$dir/$cur_prim
  (( extr_dbg >= 2 )) && ceph-kvstore-tool bluestore-kv $kvdir dump p 2> /dev/null | grep -a -e 'SNA_[0-9]_' \
        | awk -e '{print $2;}'
  local num_sna_full=`ceph-kvstore-tool bluestore-kv $kvdir dump p 2> /dev/null | grep -a -e 'SNA_[0-9]_000000000000000[0-9]_000000000000000' \
        | awk -e '{print $2;}' | wc -l`
  (( num_sna_full == num_sna_b4 )) || return 1
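  # The comparison above verifies that, after the repairing scrub, the number of
  # well-formed SNA_ mapping keys in the current primary's DB matches the
  # pre-corruption count taken from the original primary (all OSDs were corrupted
  # identically, so the counts are comparable) - i.e. the truncated entry was restored.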
  return 0
}


main osd-mapper "$@"