#!/usr/bin/env bash
# -*- mode:text; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
# vim: ts=8 sw=2 smarttab
#
# test the handling of a corrupted SnapMapper DB by Scrub

source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
source $CEPH_ROOT/qa/standalone/scrub/scrub-helpers.sh

function run() {
    local dir=$1
    shift

    export CEPH_MON="127.0.0.1:7144" # git grep '\<7144\>' : there must be only one
    export CEPH_ARGS
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
    CEPH_ARGS+="--mon-host=$CEPH_MON "

    export -n CEPH_CLI_TEST_DUP_COMMAND
    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
    for func in $funcs ; do
        setup $dir || return 1
        $func $dir || return 1
        teardown $dir || return 1
    done
}

# one clone & multiple snaps (according to the number of parameters)
function make_a_clone()
{
    # turn off '-x' (but remember previous state)
    local saved_echo_flag=${-//[^x]/}
    set +x
    local pool=$1
    local obj=$2
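    # (over)write the head object; once the pool has snapshots, RADOS keeps the
    # previous head contents as a clone, so repeated calls build up clones and snaps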
    echo $RANDOM | rados -p $pool put $obj - || return 1
    shift 2
    for snap in $@ ; do
        rados -p $pool mksnap $snap || return 1
    done
    if [[ -n "$saved_echo_flag" ]]; then set -x; fi
}

function TEST_truncated_sna_record() {
    local dir=$1
    local -A cluster_conf=(
        ['osds_num']="3"
        ['pgs_in_pool']="4"
        ['pool_name']="test"
    )

    local extr_dbg=3
    (( extr_dbg > 1 )) && echo "Dir: $dir"
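    # standard_scrub_cluster (from scrub-helpers.sh) brings up the mon/mgr, the
    # requested OSDs and the test pool; it is also expected to add derived entries
    # such as 'pool_id' and 'osd_args' to cluster_conf, which are used below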
    standard_scrub_cluster $dir cluster_conf
    ceph tell osd.* config set osd_stats_update_period_not_scrubbing "1"
    ceph tell osd.* config set osd_stats_update_period_scrubbing "1"

    local osdn=${cluster_conf['osds_num']}
    local poolid=${cluster_conf['pool_id']}
    local poolname=${cluster_conf['pool_name']}
    local objname="objxxx"

    # create an object and clone it
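    # each make_a_clone call rewrites the head and takes the listed pool snapshots,
    # leaving the object with several clones; the clone kept under snap13 (presumably
    # snap id 3, matching the 'SNA_..._0000000000000003_' key grepped for below) is
    # the one whose SnapMapper record gets truncated later in the test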
    make_a_clone $poolname $objname snap01 snap02 || return 1
    make_a_clone $poolname $objname snap13 || return 1
    make_a_clone $poolname $objname snap24 snap25 || return 1
    echo $RANDOM | rados -p $poolname put $objname - || return 1

    # identify the PG and the primary OSD
    local pgid=`ceph --format=json-pretty osd map $poolname $objname | jq -r '.pgid'`
    local osd=`ceph --format=json-pretty osd map $poolname $objname | jq -r '.up[0]'`
    echo "pgid is $pgid (primary: osd.$osd)"
    # turn on the publishing of test data in the 'scrubber' section of 'pg query' output
    set_query_debug $pgid

    # verify the existence of these clones
    (( extr_dbg >= 1 )) && rados --format json-pretty -p $poolname listsnaps $objname

    # scrub the PG
    ceph pg $pgid deep_scrub || return 1

    # we aren't just waiting for the scrub to terminate, but also for the
    # logs to be published
    sleep 3
    ceph pg dump pgs
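    # poll the primary's log until the scrubber FSM reports the ScrubFinished event,
    # i.e. until the deep scrub has actually completed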
    until grep -a -q -- "event: --^^^^---- ScrubFinished" $dir/osd.$osd.log ; do
        sleep 0.2
    done

    ceph pg dump pgs
    ceph osd set noscrub || return 1
    ceph osd set nodeep-scrub || return 1
    sleep 5
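    # no scrub errors are expected at this stage: the SnapMapper data is still intact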
    ! grep -a -q "ERR" $dir/osd.$osd.log || return 1

    # kill the OSDs
    kill_daemons $dir TERM osd || return 1

    (( extr_dbg >= 2 )) && ceph-kvstore-tool bluestore-kv $dir/0 dump "p"
    (( extr_dbg >= 2 )) && ceph-kvstore-tool bluestore-kv $dir/2 dump "p" | grep -a SNA_
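    # note: /tmp/oo2.dump is not created anywhere in this test; the next debug line
    # appears to be a leftover from manual debugging and, if the file is missing,
    # merely prints a grep error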
    (( extr_dbg >= 2 )) && grep -a SNA_ /tmp/oo2.dump
    (( extr_dbg >= 2 )) && ceph-kvstore-tool bluestore-kv $dir/2 dump p 2> /dev/null
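    # baseline: count the well-formed SNA_ mapping entries on the primary before
    # corrupting them; the same count is expected again at the end of the test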
    local num_sna_b4=`ceph-kvstore-tool bluestore-kv $dir/$osd dump p 2> /dev/null | grep -a -e 'SNA_[0-9]_000000000000000[0-9]_000000000000000' \
        | awk -e '{print $2;}' | wc -l`

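    # SnapMapper keeps its snap-to-clone mappings in the 'p' (omap) prefix, under keys
    # that start with SNA_<pool>_<snap id as 16 hex digits>_ followed by an encoding of
    # the clone object. For every OSD, the loop below replaces the key of the snap-13
    # mapping with a copy missing its last 30 characters (keeping the original value),
    # i.e. it plants a truncated SnapMapper record for the scrub to trip over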
    for sdn in $(seq 0 $(expr $osdn - 1))
    do
        kvdir=$dir/$sdn
        echo "corrupting the SnapMapper DB of osd.$sdn (db: $kvdir)"
        (( extr_dbg >= 3 )) && ceph-kvstore-tool bluestore-kv $kvdir dump "p"

        # truncate the 'mapping' (SNA_) entry corresponding to the snap13 clone
        KY=`ceph-kvstore-tool bluestore-kv $kvdir dump p 2> /dev/null | grep -a -e 'SNA_[0-9]_0000000000000003_000000000000000' \
            | awk -e '{print $2;}'`
        (( extr_dbg >= 1 )) && echo "SNA key: $KY" | cat -v

        tmp_fn1=`mktemp -p /tmp --suffix="_the_val"`
        (( extr_dbg >= 1 )) && echo "Value dumped in: $tmp_fn1"
        ceph-kvstore-tool bluestore-kv $kvdir get p "$KY" out $tmp_fn1 2> /dev/null
        (( extr_dbg >= 2 )) && od -xc $tmp_fn1

        NKY=${KY:0:-30}
        ceph-kvstore-tool bluestore-kv $kvdir rm "p" "$KY" 2> /dev/null
        ceph-kvstore-tool bluestore-kv $kvdir set "p" "$NKY" in $tmp_fn1 2> /dev/null

        (( extr_dbg >= 1 )) || rm $tmp_fn1
    done

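    # restart the OSDs with the same extra arguments they were originally started
    # with (presumably recorded by standard_scrub_cluster in cluster_conf['osd_args'])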
    orig_osd_args=" ${cluster_conf['osd_args']}"
    orig_osd_args=" $(echo $orig_osd_args)"
    (( extr_dbg >= 2 )) && echo "Copied OSD args: /$orig_osd_args/ /${orig_osd_args:1}/"
    for sdn in $(seq 0 $(expr $osdn - 1))
    do
        CEPH_ARGS="$CEPH_ARGS $orig_osd_args" activate_osd $dir $sdn
    done
    sleep 1

    for sdn in $(seq 0 $(expr $osdn - 1))
    do
        timeout 60 ceph tell osd.$sdn version
    done
    rados --format json-pretty -p $poolname listsnaps $objname

    # when scrubbing now - we expect the scrub to emit a cluster log ERR message
    # regarding SnapMapper internal inconsistency
    ceph osd unset nodeep-scrub || return 1
    ceph osd unset noscrub || return 1

    # what is the primary now?
    local cur_prim=`ceph --format=json-pretty osd map $poolname $objname | jq -r '.up[0]'`
    ceph pg dump pgs
    sleep 2
    ceph pg $pgid deep_scrub || return 1
    sleep 5
    ceph pg dump pgs
    (( extr_dbg >= 1 )) && grep -a "ERR" $dir/osd.$cur_prim.log
    grep -a -q "ERR" $dir/osd.$cur_prim.log || return 1

    # but did we fix the snap issue? let's try scrubbing again

    local prev_err_cnt=`grep -a "ERR" $dir/osd.$cur_prim.log | wc -l`
    echo "prev count: $prev_err_cnt"

    # scrub again. No errors expected this time
    ceph pg $pgid deep_scrub || return 1
    sleep 5
    ceph pg dump pgs
    (( extr_dbg >= 1 )) && grep -a "ERR" $dir/osd.$cur_prim.log
    local current_err_cnt=`grep -a "ERR" $dir/osd.$cur_prim.log | wc -l`
    (( extr_dbg >= 1 )) && echo "current count: $current_err_cnt"
    (( current_err_cnt == prev_err_cnt )) || return 1
    kill_daemons $dir TERM osd || return 1
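    # final verification: on the current primary, the number of well-formed SNA_ keys
    # should again match the pre-corruption baseline, indicating that the scrub
    # restored (or re-created) the truncated SnapMapper record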
    kvdir=$dir/$cur_prim
    (( extr_dbg >= 2 )) && ceph-kvstore-tool bluestore-kv $kvdir dump p 2> /dev/null | grep -a -e 'SNA_[0-9]_' \
        | awk -e '{print $2;}'
    local num_sna_full=`ceph-kvstore-tool bluestore-kv $kvdir dump p 2> /dev/null | grep -a -e 'SNA_[0-9]_000000000000000[0-9]_000000000000000' \
        | awk -e '{print $2;}' | wc -l`
    (( num_sna_full == num_sna_b4 )) || return 1
    return 0
}


main osd-mapper "$@"