#!/usr/bin/env bash
# @file scrub-helpers.sh
# @brief a collection of bash functions useful for scrub standalone tests
#

# extract_published_sch()
#
# Use the output of both the 'ceph pg dump pgs' and the 'ceph pg x.x query'
# commands to determine the published scrub scheduling status of a given PG.
#
# $1: pg id
# $2: 'current' time to compare to
# $3: an additional time-point to compare to
# $4: [out] dictionary
#
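# Example usage (an illustrative sketch; the PG id and the array name are
# arbitrary, and '$saved_date' is assumed to hold a previously captured date):
#   declare -A sched_data
#   extract_published_sch 1.0 "$(date -I"ns")" "$saved_date" sched_data
#   echo "scheduled at: ${sched_data['dmp_schedule_at']}"
#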
function extract_published_sch() {
    local pgn="$1"
    local -n dict=$4 # a ref to the in/out dictionary
    local current_time=$2
    local extra_time=$3
    local extr_dbg=1 # note: 3 and above leave some temp files around

    # turn off '-x' (but remember the previous state)
    local saved_echo_flag=${-//[^x]/}
    set +x

    (( extr_dbg >= 3 )) && ceph pg dump pgs -f json-pretty >> /tmp/a_dmp$$
    (( extr_dbg >= 3 )) && ceph pg $1 query -f json-pretty >> /tmp/a_qry$$

    from_dmp=`ceph pg dump pgs -f json-pretty | jq -r --arg pgn "$pgn" --arg extra_dt "$extra_time" --arg current_dt "$current_time" '[
      [[.pg_stats[]] | group_by(.pg_stats)][0][0] |
      [.[] |
        select(has("pgid") and .pgid == $pgn) |

        (.dmp_stat_part=(.scrub_schedule | if test(".*@.*") then (split(" @ ")|first) else . end)) |
        (.dmp_when_part=(.scrub_schedule | if test(".*@.*") then (split(" @ ")|last) else "0" end)) |

        [ {
          dmp_pg_state: .state,
          dmp_state_has_scrubbing: (.state | test(".*scrub.*";"i")),
          dmp_last_duration: .last_scrub_duration,
          dmp_schedule: .dmp_stat_part,
          dmp_schedule_at: .dmp_when_part,
          dmp_is_future: ( .dmp_when_part > $current_dt ),
          dmp_vs_date: ( .dmp_when_part > $extra_dt ),
          dmp_reported_epoch: .reported_epoch,
          dmp_seq: .reported_seq
        }] ]][][][]'`

    (( extr_dbg >= 2 )) && echo "from pg dump pg: $from_dmp"
    (( extr_dbg >= 2 )) && echo "query output:"
    (( extr_dbg >= 2 )) && ceph pg $1 query -f json-pretty | awk -e '/scrubber/,/agent_state/ {print;}'

    from_qry=`ceph pg $1 query -f json-pretty | jq -r --arg extra_dt "$extra_time" --arg current_dt "$current_time" --arg spt "'" '
      . |
      (.q_stat_part=((.scrubber.schedule// "-") | if test(".*@.*") then (split(" @ ")|first) else . end)) |
      (.q_when_part=((.scrubber.schedule// "0") | if test(".*@.*") then (split(" @ ")|last) else "0" end)) |
      (.q_when_is_future=(.q_when_part > $current_dt)) |
      (.q_vs_date=(.q_when_part > $extra_dt)) |
      {
        query_epoch: .epoch,
        query_seq: .info.stats.reported_seq,
        query_active: (.scrubber | if has("active") then .active else "bug" end),
        query_schedule: .q_stat_part,
        query_schedule_at: .q_when_part,
        query_last_duration: .info.stats.last_scrub_duration,
        query_last_stamp: .info.history.last_scrub_stamp,
        query_last_scrub: (.info.history.last_scrub | sub($spt;"x")),
        query_is_future: .q_when_is_future,
        query_vs_date: .q_vs_date,
        query_scrub_seq: .scrubber.test_sequence
      }
    '`
    (( extr_dbg >= 1 )) && echo $from_qry " " $from_dmp | jq -s -r 'add | "(",(to_entries | .[] | "["+(.key)+"]="+(.value|@sh)),")"'

    # Note: passing a reference to an associative array directly is tricky;
    # instead, we copy the results into the caller's dictionary:
    local -A dict_src=`echo $from_qry " " $from_dmp | jq -s -r 'add | "(",(to_entries | .[] | "["+(.key)+"]="+(.value|@sh)),")"'`
    dict=()
    for k in "${!dict_src[@]}"; do dict[$k]=${dict_src[$k]}; done

    if [[ -n "$saved_echo_flag" ]]; then set -x; fi
}

# wait_any_cond()
#
# Query the PG until any of the conditions in the 'expected' array is met.
#
# A condition may be negated by an additional entry in the 'expected' array, of
# the form:
#   key: the original key, with a "_neg" suffix;
#   value: not checked.
#
# $1: pg id
# $2: max retries
# $3: a date to use in comparisons
# $4: set of K/V conditions
# $5: debug message
# $6: [out] the results array
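#
# Example usage (an illustrative sketch; '$pgid' and '$saved_date' are assumed
# to be set by the caller). The '_neg' entry inverts the 'query_active' check,
# i.e. we wait until the scrubber is no longer active:
#   declare -A cond=( ['query_active']="true" ['query_active_neg']="yes" )
#   declare -A res
#   wait_any_cond $pgid 10 "$saved_date" cond "waiting for scrub to end" res || return 1
#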
function wait_any_cond() {
    local pgid="$1"
    local retries=$2
    local cmp_date=$3
    local -n ep=$4
    local -n out_array=$6
    local -A sc_data
    local extr_dbg=2

    # turn off '-x' (but remember the previous state)
    local saved_echo_flag=${-//[^x]/}
    set +x

    local now_is=`date -I"ns"`
    (( extr_dbg >= 2 )) && echo "waiting for any condition ($5): pg:$pgid dt:$cmp_date ($retries retries)"

    for i in $(seq 1 $retries)
    do
        sleep 0.5
        extract_published_sch $pgid $now_is $cmp_date sc_data
        (( extr_dbg >= 4 )) && echo "${sc_data['dmp_last_duration']}"
        (( extr_dbg >= 4 )) && echo "----> loop: $i ~ ${sc_data['dmp_last_duration']} / " ${sc_data['query_vs_date']} " / ${sc_data['dmp_is_future']}"
        (( extr_dbg >= 2 )) && echo "--> loop: $i ~ ${sc_data['query_active']} / ${sc_data['query_seq']} / ${sc_data['dmp_seq']} " \
            "/ ${sc_data['query_is_future']} / ${sc_data['query_last_stamp']} / ${sc_data['query_schedule']} %%% ${!ep[@]}"

        # perform schedule_against_expected(), but with slightly different out-messages behaviour
        for k_ref in "${!ep[@]}"
        do
            (( extr_dbg >= 3 )) && echo "key is $k_ref"
            # is this a real key, or just a negation flag for another key?
            [[ $k_ref =~ "_neg" ]] && continue

            local act_val=${sc_data[$k_ref]}
            local exp_val=${ep[$k_ref]}

            # possible negation? look for a matching key
            local neg_key="${k_ref}_neg"
            (( extr_dbg >= 3 )) && echo "neg-key is $neg_key"
            if [ -v 'ep[$neg_key]' ]; then
                is_neg=1
            else
                is_neg=0
            fi

            (( extr_dbg >= 1 )) && echo "key is $k_ref: negation:$is_neg # expected: $exp_val # in actual: $act_val"
            is_eq=0
            [[ $exp_val == $act_val ]] && is_eq=1
            if (($is_eq ^ $is_neg))
            then
                echo "$5 - '$k_ref' actual value ($act_val) matches expected ($exp_val) (negation: $is_neg)"
                for k in "${!sc_data[@]}"; do out_array[$k]=${sc_data[$k]}; done
                if [[ -n "$saved_echo_flag" ]]; then set -x; fi
                return 0
            fi
        done
    done

    echo "$5: wait_any_cond(): failure. Note: query-active=${sc_data['query_active']}"
    if [[ -n "$saved_echo_flag" ]]; then set -x; fi
    return 1
}


# schedule_against_expected()
#
# Compare the scrub scheduling state collected by extract_published_sch() to a
# set of expected values. All values are expected to match.
#
# $1: the published scheduling state
# $2: a set of conditions to verify
# $3: text to be echoed for a failed match
#
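# Example usage (an illustrative sketch; 'sched_data' is assumed to have been
# filled by extract_published_sch(), and the expected values are arbitrary):
#   declare -A expct=( ['query_active']="false" ['dmp_state_has_scrubbing']="false" )
#   schedule_against_expected sched_data expct "unexpected scrub state" || return 1
#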
function schedule_against_expected() {
    local -n dict=$1 # a ref to the published state
    local -n ep=$2 # the expected results
    local extr_dbg=1

    # turn off '-x' (but remember the previous state)
    local saved_echo_flag=${-//[^x]/}
    set +x

    (( extr_dbg >= 1 )) && echo "-- - comparing:"
    for k_ref in "${!ep[@]}"
    do
        local act_val=${dict[$k_ref]}
        local exp_val=${ep[$k_ref]}
        (( extr_dbg >= 1 )) && echo "key is " $k_ref " expected: " $exp_val " in actual: " $act_val
        if [[ $exp_val != $act_val ]]
        then
            echo "$3 - '$k_ref' actual value ($act_val) differs from expected ($exp_val)"
            echo '####################################################^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^'

            if [[ -n "$saved_echo_flag" ]]; then set -x; fi
            return 1
        fi
    done

    if [[ -n "$saved_echo_flag" ]]; then set -x; fi
    return 0
}


# standard_scrub_cluster()
#
# Start the cluster "nodes" and create a pool for testing.
#
# The OSDs are started with a set of parameters aimed at creating a repeatable
# and stable scrub sequence:
# - no scrub randomizations/backoffs
# - no autoscaler
#
# $1: the test directory
# $2: [in/out] an array of configuration values
#
# The function adds/updates the configuration dictionary with the name of the
# pool created, and its ID.
#
# Argument 2 might look like this:
#
#  declare -A test_conf=(
#    ['osds_num']="3"
#    ['pgs_in_pool']="7"
#    ['extras']="--extra1 --extra2"
#    ['pool_name']="testpl"
#  )
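#
# Example usage (an illustrative sketch, reusing the 'test_conf' declared above;
# '$dir' is the test directory handed to the test by the framework):
#   standard_scrub_cluster "$dir" test_conf || return 1
#   echo "created pool ${test_conf['pool_name']} with id ${test_conf['pool_id']}"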
function standard_scrub_cluster() {
    local dir=$1
    local -n args=$2

    local OSDS=${args['osds_num']:-"3"}
    local pg_num=${args['pgs_in_pool']:-"8"}
    local poolname="${args['pool_name']:-test}"
    args['pool_name']=$poolname
    local extra_pars=${args['extras']}
    local debug_msg=${args['msg']:-"dbg"}

    # turn off '-x' (but remember the previous state)
    local saved_echo_flag=${-//[^x]/}
    set +x

    run_mon $dir a --osd_pool_default_size=$OSDS || return 1
    run_mgr $dir x || return 1

    local ceph_osd_args="--osd_deep_scrub_randomize_ratio=0 \
            --osd_scrub_interval_randomize_ratio=0 \
            --osd_scrub_backoff_ratio=0.0 \
            --osd_pool_default_pg_autoscale_mode=off \
            --osd_pg_stat_report_interval_max=1 \
            $extra_pars"

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd $(echo $ceph_osd_args) || return 1
    done

    create_pool $poolname $pg_num $pg_num
    wait_for_clean || return 1

    # update the in/out 'args' with the ID of the new pool
    sleep 1
    name_n_id=`ceph osd dump | awk '/^pool.*'$poolname'/ { gsub(/'"'"'/," ",$3); print $3," ", $2}'`
    echo "standard_scrub_cluster: $debug_msg: test pool is $name_n_id"
    args['pool_id']="${name_n_id##* }"
    args['osd_args']=$ceph_osd_args
    if [[ -n "$saved_echo_flag" ]]; then set -x; fi
}


# standard_scrub_wpq_cluster()
#
# Start the cluster "nodes" and create a pool for testing - wpq version.
#
# A variant of standard_scrub_cluster() that selects the wpq scheduler and sets
# osd_scrub_sleep. To be used when the test is attempting to "catch" the
# scrubber during an ongoing scrub.
#
# See standard_scrub_cluster() for more details.
#
# $1: the test directory
# $2: [in/out] an array of configuration values
# $3: osd_scrub_sleep
#
# The function adds/updates the configuration dictionary with the name of the
# pool created, and its ID.
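#
# Example usage (an illustrative sketch; the 2.5s scrub sleep is an arbitrary
# value chosen to slow the scrub down enough to observe it):
#   declare -A test_conf=( ['osds_num']="3" ['pgs_in_pool']="8" )
#   standard_scrub_wpq_cluster "$dir" test_conf 2.5 || return 1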
function standard_scrub_wpq_cluster() {
    local dir=$1
    local -n conf=$2
    local osd_sleep=$3

    conf['extras']=" --osd_op_queue=wpq --osd_scrub_sleep=$osd_sleep ${conf['extras']}"

    standard_scrub_cluster $dir conf || return 1
}


# set_query_debug()
#
# Set a debug flag for the specified PG, causing the 'pg query' command to
# display an additional 'scrub sessions counter' field.
#
# $1: PG id
#
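# Example usage (an illustrative sketch; the PG id is arbitrary, and the
# 'test_sequence' field is the one read back by extract_published_sch()):
#   set_query_debug 1.0
#   ceph pg 1.0 query | jq '.scrubber.test_sequence'
#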
function set_query_debug() {
    local pgid=$1
    local prim_osd=`ceph pg dump pgs_brief | \
        awk -v pg="^$pgid" -n -e '$0 ~ pg { print(gensub(/[^0-9]*([0-9]+).*/,"\\\\1","g",$5)); }'`

    echo "Setting scrub debug data. Primary for $pgid is $prim_osd"
    CEPH_ARGS='' ceph --format=json daemon $(get_asok_path osd.$prim_osd) \
        scrubdebug $pgid set sessions
}