#!/usr/bin/env bash
# @file scrub-helpers.sh
# @brief a collection of bash functions useful for scrub standalone tests
#

# extract_published_sch()
#
# Use the output of both the 'ceph pg dump pgs' and the 'ceph pg x.x query'
# commands to determine the published scrub scheduling status of a given PG.
#
# $1: pg id
# $2: 'current' time to compare to
# $3: an additional time-point to compare to
# $4: [out] dictionary
#
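# Example usage (an illustrative sketch; the PG id and the array name are
# arbitrary, and '$saved_date' is assumed to hold a previously captured date):
#   declare -A sched_data
#   extract_published_sch 1.0 "$(date -I"ns")" "$saved_date" sched_data
#   echo "scheduled at: ${sched_data['dmp_schedule_at']}"
#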
function extract_published_sch() {
    local pgn="$1"
    local -n dict=$4 # a ref to the in/out dictionary
    local current_time=$2
    local extra_time=$3
    local extr_dbg=1 # note: 3 and above leave some temp files around

    # turn off '-x' (but remember the previous state)
    local saved_echo_flag=${-//[^x]/}
    set +x

    (( extr_dbg >= 3 )) && ceph pg dump pgs -f json-pretty >> /tmp/a_dmp$$
    (( extr_dbg >= 3 )) && ceph pg $1 query -f json-pretty >> /tmp/a_qry$$

    from_dmp=`ceph pg dump pgs -f json-pretty | jq -r --arg pgn "$pgn" --arg extra_dt "$extra_time" --arg current_dt "$current_time" '[
      [[.pg_stats[]] | group_by(.pg_stats)][0][0] |
      [.[] |
        select(has("pgid") and .pgid == $pgn) |

        (.dmp_stat_part=(.scrub_schedule | if test(".*@.*") then (split(" @ ")|first) else . end)) |
        (.dmp_when_part=(.scrub_schedule | if test(".*@.*") then (split(" @ ")|last) else "0" end)) |

        [ {
          dmp_pg_state: .state,
          dmp_state_has_scrubbing: (.state | test(".*scrub.*";"i")),
          dmp_last_duration: .last_scrub_duration,
          dmp_schedule: .dmp_stat_part,
          dmp_schedule_at: .dmp_when_part,
          dmp_is_future: ( .dmp_when_part > $current_dt ),
          dmp_vs_date: ( .dmp_when_part > $extra_dt ),
          dmp_reported_epoch: .reported_epoch,
          dmp_seq: .reported_seq
        }] ]][][][]'`

    (( extr_dbg >= 2 )) && echo "from pg dump pg: $from_dmp"
    (( extr_dbg >= 2 )) && echo "query output:"
    (( extr_dbg >= 2 )) && ceph pg $1 query -f json-pretty | awk -e '/scrubber/,/agent_state/ {print;}'

    from_qry=`ceph pg $1 query -f json-pretty | jq -r --arg extra_dt "$extra_time" --arg current_dt "$current_time" --arg spt "'" '
      . |
      (.q_stat_part=((.scrubber.schedule// "-") | if test(".*@.*") then (split(" @ ")|first) else . end)) |
      (.q_when_part=((.scrubber.schedule// "0") | if test(".*@.*") then (split(" @ ")|last) else "0" end)) |
      (.q_when_is_future=(.q_when_part > $current_dt)) |
      (.q_vs_date=(.q_when_part > $extra_dt)) |
      {
        query_epoch: .epoch,
        query_seq: .info.stats.reported_seq,
        query_active: (.scrubber | if has("active") then .active else "bug" end),
        query_schedule: .q_stat_part,
        query_schedule_at: .q_when_part,
        query_last_duration: .info.stats.last_scrub_duration,
        query_last_stamp: .info.history.last_scrub_stamp,
        query_last_scrub: (.info.history.last_scrub | sub($spt;"x")),
        query_is_future: .q_when_is_future,
        query_vs_date: .q_vs_date,
        query_scrub_seq: .scrubber.test_sequence
      }
    '`
    (( extr_dbg >= 1 )) && echo $from_qry " " $from_dmp | jq -s -r 'add | "(",(to_entries | .[] | "["+(.key)+"]="+(.value|@sh)),")"'

    # Note: passing a reference to an associative array directly is tricky;
    # instead, we copy the results into the caller's dictionary:
    local -A dict_src=`echo $from_qry " " $from_dmp | jq -s -r 'add | "(",(to_entries | .[] | "["+(.key)+"]="+(.value|@sh)),")"'`
    dict=()
    for k in "${!dict_src[@]}"; do dict[$k]=${dict_src[$k]}; done

    if [[ -n "$saved_echo_flag" ]]; then set -x; fi
}

# wait_any_cond()
#
# Query the PG until any of the conditions in the 'expected' array is met.
#
# A condition may be negated by an additional entry in the 'expected' array, of
# the form:
#   key: the original key, with a "_neg" suffix;
#   value: not checked.
#
# $1: pg id
# $2: max retries
# $3: a date to use in comparisons
# $4: set of K/V conditions
# $5: debug message
# $6: [out] the results array
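#
# Example usage (an illustrative sketch; '$pgid' and '$saved_date' are assumed
# to be set by the caller). The '_neg' entry inverts the 'query_active' check,
# i.e. we wait until the scrubber is no longer active:
#   declare -A cond=( ['query_active']="true" ['query_active_neg']="yes" )
#   declare -A res
#   wait_any_cond $pgid 10 "$saved_date" cond "waiting for scrub to end" res || return 1
#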
function wait_any_cond() {
    local pgid="$1"
    local retries=$2
    local cmp_date=$3
    local -n ep=$4
    local -n out_array=$6
    local -A sc_data
    local extr_dbg=2

    # turn off '-x' (but remember the previous state)
    local saved_echo_flag=${-//[^x]/}
    set +x

    local now_is=`date -I"ns"`
    (( extr_dbg >= 2 )) && echo "waiting for any condition ($5): pg:$pgid dt:$cmp_date ($retries retries)"

    for i in $(seq 1 $retries)
    do
        sleep 0.5
        extract_published_sch $pgid $now_is $cmp_date sc_data
        (( extr_dbg >= 4 )) && echo "${sc_data['dmp_last_duration']}"
        (( extr_dbg >= 4 )) && echo "----> loop: $i ~ ${sc_data['dmp_last_duration']} / " ${sc_data['query_vs_date']} " / ${sc_data['dmp_is_future']}"
        (( extr_dbg >= 2 )) && echo "--> loop: $i ~ ${sc_data['query_active']} / ${sc_data['query_seq']} / ${sc_data['dmp_seq']} " \
            "/ ${sc_data['query_is_future']} / ${sc_data['query_last_stamp']} / ${sc_data['query_schedule']} %%% ${!ep[@]}"

        # perform schedule_against_expected(), but with slightly different out-messages behaviour
        for k_ref in "${!ep[@]}"
        do
            (( extr_dbg >= 3 )) && echo "key is $k_ref"
            # is this a real key, or just a negation flag for another key?
            [[ $k_ref =~ "_neg" ]] && continue

            local act_val=${sc_data[$k_ref]}
            local exp_val=${ep[$k_ref]}

            # possible negation? look for a matching key
            local neg_key="${k_ref}_neg"
            (( extr_dbg >= 3 )) && echo "neg-key is $neg_key"
            if [ -v 'ep[$neg_key]' ]; then
                is_neg=1
            else
                is_neg=0
            fi

            (( extr_dbg >= 1 )) && echo "key is $k_ref: negation:$is_neg # expected: $exp_val # in actual: $act_val"
            is_eq=0
            [[ $exp_val == $act_val ]] && is_eq=1
            if (($is_eq ^ $is_neg))
            then
                echo "$5 - '$k_ref' actual value ($act_val) matches expected ($exp_val) (negation: $is_neg)"
                for k in "${!sc_data[@]}"; do out_array[$k]=${sc_data[$k]}; done
                if [[ -n "$saved_echo_flag" ]]; then set -x; fi
                return 0
            fi
        done
    done

    echo "$5: wait_any_cond(): failure. Note: query-active=${sc_data['query_active']}"
    if [[ -n "$saved_echo_flag" ]]; then set -x; fi
    return 1
}


# schedule_against_expected()
#
# Compare the scrub scheduling state collected by extract_published_sch() to a
# set of expected values. All values are expected to match.
#
# $1: the published scheduling state
# $2: a set of conditions to verify
# $3: text to be echoed for a failed match
#
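# Example usage (an illustrative sketch; 'sched_data' is assumed to have been
# filled by extract_published_sch(), and the expected values are arbitrary):
#   declare -A expct=( ['query_active']="false" ['dmp_state_has_scrubbing']="false" )
#   schedule_against_expected sched_data expct "unexpected scrub state" || return 1
#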
function schedule_against_expected() {
    local -n dict=$1 # a ref to the published state
    local -n ep=$2 # the expected results
    local extr_dbg=1

    # turn off '-x' (but remember the previous state)
    local saved_echo_flag=${-//[^x]/}
    set +x

    (( extr_dbg >= 1 )) && echo "-- - comparing:"
    for k_ref in "${!ep[@]}"
    do
        local act_val=${dict[$k_ref]}
        local exp_val=${ep[$k_ref]}
        (( extr_dbg >= 1 )) && echo "key is " $k_ref " expected: " $exp_val " in actual: " $act_val
        if [[ $exp_val != $act_val ]]
        then
            echo "$3 - '$k_ref' actual value ($act_val) differs from expected ($exp_val)"
            echo '####################################################^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^'

            if [[ -n "$saved_echo_flag" ]]; then set -x; fi
            return 1
        fi
    done

    if [[ -n "$saved_echo_flag" ]]; then set -x; fi
    return 0
}


# standard_scrub_cluster()
#
# Start the cluster "nodes" and create a pool for testing.
#
# The OSDs are started with a set of parameters aimed at creating a repeatable
# and stable scrub sequence:
# - no scrub randomizations/backoffs
# - no autoscaler
#
# $1: the test directory
# $2: [in/out] an array of configuration values
#
# The function adds/updates the configuration dictionary with the name of the
# pool created, and its ID.
#
# Argument 2 might look like this:
#
#  declare -A test_conf=(
#    ['osds_num']="3"
#    ['pgs_in_pool']="7"
#    ['extras']="--extra1 --extra2"
#    ['pool_name']="testpl"
#  )
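#
# Example usage (an illustrative sketch, reusing the 'test_conf' declared above;
# '$dir' is the test directory handed to the test by the framework):
#   standard_scrub_cluster "$dir" test_conf || return 1
#   echo "created pool ${test_conf['pool_name']} with id ${test_conf['pool_id']}"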
function standard_scrub_cluster() {
    local dir=$1
    local -n args=$2

    local OSDS=${args['osds_num']:-"3"}
    local pg_num=${args['pgs_in_pool']:-"8"}
    local poolname="${args['pool_name']:-test}"
    args['pool_name']=$poolname
    local extra_pars=${args['extras']}
    local debug_msg=${args['msg']:-"dbg"}

    # turn off '-x' (but remember the previous state)
    local saved_echo_flag=${-//[^x]/}
    set +x

    run_mon $dir a --osd_pool_default_size=$OSDS || return 1
    run_mgr $dir x || return 1

    local ceph_osd_args="--osd_deep_scrub_randomize_ratio=0 \
            --osd_scrub_interval_randomize_ratio=0 \
            --osd_scrub_backoff_ratio=0.0 \
            --osd_pool_default_pg_autoscale_mode=off \
            --osd_pg_stat_report_interval_max=1 \
            $extra_pars"

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd $(echo $ceph_osd_args) || return 1
    done

    create_pool $poolname $pg_num $pg_num
    wait_for_clean || return 1

    # update the in/out 'args' with the ID of the new pool
    sleep 1
    name_n_id=`ceph osd dump | awk '/^pool.*'$poolname'/ { gsub(/'"'"'/," ",$3); print $3," ", $2}'`
    echo "standard_scrub_cluster: $debug_msg: test pool is $name_n_id"
    args['pool_id']="${name_n_id##* }"
    args['osd_args']=$ceph_osd_args
    if [[ -n "$saved_echo_flag" ]]; then set -x; fi
}


# standard_scrub_wpq_cluster()
#
# Start the cluster "nodes" and create a pool for testing - wpq version.
#
# A variant of standard_scrub_cluster() that selects the wpq scheduler and sets
# osd_scrub_sleep. To be used when the test is attempting to "catch" the
# scrubber during an ongoing scrub.
#
# See standard_scrub_cluster() for more details.
#
# $1: the test directory
# $2: [in/out] an array of configuration values
# $3: osd_scrub_sleep
#
# The function adds/updates the configuration dictionary with the name of the
# pool created, and its ID.
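#
# Example usage (an illustrative sketch; the 2.5s scrub sleep is an arbitrary
# value chosen to slow the scrub down enough to observe it):
#   declare -A test_conf=( ['osds_num']="3" ['pgs_in_pool']="8" )
#   standard_scrub_wpq_cluster "$dir" test_conf 2.5 || return 1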
function standard_scrub_wpq_cluster() {
    local dir=$1
    local -n conf=$2
    local osd_sleep=$3

    conf['extras']=" --osd_op_queue=wpq --osd_scrub_sleep=$osd_sleep ${conf['extras']}"

    standard_scrub_cluster $dir conf || return 1
}


# set_query_debug()
#
# Set a debug flag for the specified PG, causing the 'pg query' command to
# display an additional 'scrub sessions counter' field.
#
# $1: PG id
#
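# Example usage (an illustrative sketch; the PG id is arbitrary, and the
# 'test_sequence' field is the one read back by extract_published_sch()):
#   set_query_debug 1.0
#   ceph pg 1.0 query | jq '.scrubber.test_sequence'
#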
function set_query_debug() {
    local pgid=$1
    local prim_osd=`ceph pg dump pgs_brief | \
        awk -v pg="^$pgid" -n -e '$0 ~ pg { print(gensub(/[^0-9]*([0-9]+).*/,"\\\\1","g",$5)); }'`

    echo "Setting scrub debug data. Primary for $pgid is $prim_osd"
    CEPH_ARGS='' ceph --format=json daemon $(get_asok_path osd.$prim_osd) \
        scrubdebug $pgid set sessions
}