#!/bin/bash
#
# Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
# Copyright (C) 2014,2015 Red Hat <contact@redhat.com>
# Copyright (C) 2014 Federico Gimenez <fgimenez@coit.es>
#
# Author: Loic Dachary <loic@dachary.org>
# Author: Federico Gimenez <fgimenez@coit.es>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Library Public License for more details.
#
TIMEOUT=300
PG_NUM=4
: ${CEPH_BUILD_VIRTUALENV:=/tmp}

if type xmlstarlet > /dev/null 2>&1; then
    XMLSTARLET=xmlstarlet
elif type xml > /dev/null 2>&1; then
    XMLSTARLET=xml
else
    echo "Missing xmlstarlet binary!"
    exit 1
fi
if [ `uname` = FreeBSD ]; then
    SED=gsed
else
    SED=sed
fi
37
38#! @file ceph-helpers.sh
39# @brief Toolbox to manage Ceph cluster dedicated to testing
40#
41# Example use case:
42#
43# ~~~~~~~~~~~~~~~~{.sh}
44# source ceph-helpers.sh
45#
46# function mytest() {
47# # cleanup leftovers and reset mydir
48# setup mydir
49# # create a cluster with one monitor and three osds
50# run_mon mydir a
51# run_osd mydir 0
52# run_osd mydir 2
53# run_osd mydir 3
54# # put and get an object
55# rados --pool rbd put GROUP /etc/group
56# rados --pool rbd get GROUP /tmp/GROUP
57# # stop the cluster and cleanup the directory
58# teardown mydir
59# }
60# ~~~~~~~~~~~~~~~~
61#
62# The focus is on simplicity and efficiency, in the context of
63# functional tests. The output is intentionally very verbose
64# and functions return as soon as an error is found. The caller
65# is also expected to abort on the first error so that debugging
66# can be done by looking at the end of the output.
67#
# Each function is documented, implemented and tested independently.
# When modifying a helper, the test and the documentation are
# expected to be updated, and it is easier if they are collocated. A
# test for a given function can be run with
72#
73# ~~~~~~~~~~~~~~~~{.sh}
74# ceph-helpers.sh TESTS test_get_osds
75# ~~~~~~~~~~~~~~~~
76#
77# and all the tests (i.e. all functions matching test_*) are run
78# with:
79#
80# ~~~~~~~~~~~~~~~~{.sh}
81# ceph-helpers.sh TESTS
82# ~~~~~~~~~~~~~~~~
83#
# A test function takes a single argument: the directory dedicated
# to the tests. It is expected not to create any file outside of this
# directory and to remove it entirely when it completes successfully.
87#
88
89
90##
91# Cleanup any leftovers found in **dir** via **teardown**
92# and reset **dir** as an empty environment.
93#
94# @param dir path name of the environment
95# @return 0 on success, 1 on error
96#
97function setup() {
98 local dir=$1
99 teardown $dir || return 1
100 mkdir -p $dir
101}
102
function test_setup() {
    local dir=$1
    setup $dir || return 1
    test -d $dir || return 1
    setup $dir || return 1
    test -d $dir || return 1
    teardown $dir
}
111
112#######################################################################
113
114##
# Kill all daemons for which a .pid file exists in **dir** and remove
# **dir**. If the file system in which **dir** resides is btrfs, delete
# all subvolumes that relate to it.
118#
119# @param dir path name of the environment
120# @return 0 on success, 1 on error
121#
122function teardown() {
123 local dir=$1
124 kill_daemons $dir KILL
125 if [ `uname` != FreeBSD ] \
126 && [ $(stat -f -c '%T' .) == "btrfs" ]; then
127 __teardown_btrfs $dir
128 fi
129 rm -fr $dir
130}
131
132function __teardown_btrfs() {
133 local btrfs_base_dir=$1
134 local btrfs_root=$(df -P . | tail -1 | awk '{print $NF}')
135 local btrfs_dirs=$(cd $btrfs_base_dir; sudo btrfs subvolume list . -t | awk '/^[0-9]/ {print $4}' | grep "$btrfs_base_dir/$btrfs_dir")
136 for subvolume in $btrfs_dirs; do
137 sudo btrfs subvolume delete $btrfs_root/$subvolume
138 done
139}
140
function test_teardown() {
    local dir=$1
    setup $dir || return 1
    teardown $dir || return 1
    ! test -d $dir || return 1
}
147
148#######################################################################
149
150##
151# Sends a signal to a single daemon.
152# This is a helper function for kill_daemons
153#
154# After the daemon is sent **signal**, its actual termination
155# will be verified by sending it signal 0. If the daemon is
156# still alive, kill_daemon will pause for a few seconds and
157# try again. This will repeat for a fixed number of times
158# before kill_daemon returns on failure. The list of
159# sleep intervals can be specified as **delays** and defaults
160# to:
161#
162# 0.1 0.2 1 1 1 2 3 5 5 5 10 10 20 60 60 60 120
163#
# This sequence starts with a very short sleep time (0.1) in case the
# machine is fast enough for the daemon to terminate in a fraction of a
# second. The increasing sleep numbers should give plenty of time for
# the daemon to die even on the slowest running machine. If a daemon
# takes more than a few minutes to stop (the sum of all sleep times),
# there probably is no point in waiting longer and a number of things
# are likely to go wrong anyway: better to give up and return an error.
171#
# @param pidfile path of the file containing the process id to signal
173# @param send_signal the signal to send
174# @param delays sequence of sleep times before failure
175#
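# Example (a minimal sketch; the pid file path follows the
# $dir/$name.pid convention used by run_osd and the custom delay
# schedule shown here is hypothetical):
#
#   kill_daemon $dir/osd.0.pid TERM "1 2 4" || return 1
#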
176function kill_daemon() {
177 set -x
178 local pid=$(cat $1)
179 local send_signal=$2
180 local delays=${3:-0.1 0.2 1 1 1 2 3 5 5 5 10 10 20 60 60 60 120}
181 local exit_code=1
182 for try in $delays ; do
183 if kill -$send_signal $pid 2> /dev/null ; then
184 exit_code=1
185 else
186 exit_code=0
187 break
188 fi
189 send_signal=0
190 sleep $try
191 done;
192 return $exit_code
193}
194
195function test_kill_daemon() {
196 local dir=$1
197 setup $dir || return 1
198 run_mon $dir a --osd_pool_default_size=1 || return 1
199 run_mgr $dir x || return 1
200 run_osd $dir 0 || return 1
201
202 name_prefix=osd
203 for pidfile in $(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') ; do
204 #
205 # sending signal 0 won't kill the daemon
206 # waiting just for one second instead of the default schedule
207 # allows us to quickly verify what happens when kill fails
208 # to stop the daemon (i.e. it must return false)
209 #
210 ! kill_daemon $pidfile 0 1 || return 1
211 #
    # kill just the osd and verify the mon is still responsive
213 #
214 kill_daemon $pidfile TERM || return 1
215 done
216
217 ceph osd dump | grep "osd.0 down" || return 1
218
219 name_prefix=mgr
220 for pidfile in $(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') ; do
221 #
222 # kill the mgr
223 #
224 kill_daemon $pidfile TERM || return 1
225 done
226
227 name_prefix=mon
228 for pidfile in $(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') ; do
229 #
230 # kill the mon and verify it cannot be reached
231 #
232 kill_daemon $pidfile TERM || return 1
233 ! timeout 60 ceph --connect-timeout 60 status || return 1
234 done
235
236 teardown $dir || return 1
237}
238
239##
240# Kill all daemons for which a .pid file exists in **dir**. Each
241# daemon is sent a **signal** and kill_daemons waits for it to exit
242# during a few minutes. By default all daemons are killed. If a
243# **name_prefix** is provided, only the daemons for which a pid
244# file is found matching the prefix are killed. See run_osd and
245# run_mon for more information about the name conventions for
246# the pid files.
247#
248# Send TERM to all daemons : kill_daemons $dir
249# Send KILL to all daemons : kill_daemons $dir KILL
250# Send KILL to all osds : kill_daemons $dir KILL osd
251# Send KILL to osd 1 : kill_daemons $dir KILL osd.1
252#
253# If a daemon is sent the TERM signal and does not terminate
254# within a few minutes, it will still be running even after
255# kill_daemons returns.
256#
# If all daemons are killed successfully the function returns 0;
# if at least one daemon remains, this is treated as an
# error and the function returns 1.
260#
261# @param dir path name of the environment
262# @param signal name of the first signal (defaults to TERM)
# @param name_prefix only kill matching daemons (defaults to all)
264# @param delays sequence of sleep times before failure
265# @return 0 on success, 1 on error
266#
267function kill_daemons() {
268 local trace=$(shopt -q -o xtrace && echo true || echo false)
269 $trace && shopt -u -o xtrace
270 local dir=$1
271 local signal=${2:-TERM}
272 local name_prefix=$3 # optional, osd, mon, osd.1
273 local delays=$4 #optional timing
274 local status=0
275 local pids=""
276
277 for pidfile in $(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') ; do
278 run_in_background pids kill_daemon $pidfile $signal $delays
279 done
280
281 wait_background pids
282 status=$?
283
284 $trace && shopt -s -o xtrace
285 return $status
286}
287
288function test_kill_daemons() {
289 local dir=$1
290 setup $dir || return 1
291 run_mon $dir a --osd_pool_default_size=1 || return 1
292 run_mgr $dir x || return 1
293 run_osd $dir 0 || return 1
294 #
295 # sending signal 0 won't kill the daemon
296 # waiting just for one second instead of the default schedule
297 # allows us to quickly verify what happens when kill fails
298 # to stop the daemon (i.e. it must return false)
299 #
300 ! kill_daemons $dir 0 osd 1 || return 1
301 #
    # kill just the osd and verify the mon is still responsive
303 #
304 kill_daemons $dir TERM osd || return 1
305 ceph osd dump | grep "osd.0 down" || return 1
306 #
307 # kill the mgr
308 #
309 kill_daemons $dir TERM mgr || return 1
310 #
311 # kill the mon and verify it cannot be reached
312 #
313 kill_daemons $dir TERM || return 1
314 ! timeout 60 ceph --connect-timeout 60 status || return 1
315 teardown $dir || return 1
316}
317
318#######################################################################
319
320##
321# Run a monitor by the name mon.**id** with data in **dir**/**id**.
# The logs can be found in **dir**/mon.**id**.log, the pid file
# is **dir**/mon.**id**.pid and the admin socket is
# **dir**/ceph-mon.**id**.asok.
325#
326# The remaining arguments are passed verbatim to ceph-mon --mkfs
327# and the ceph-mon daemon.
328#
329# Two mandatory arguments must be provided: --fsid and --mon-host
330# Instead of adding them to every call to run_mon, they can be
331# set in the CEPH_ARGS environment variable to be read implicitly
332# by every ceph command.
333#
334# The CEPH_CONF variable is expected to be set to /dev/null to
335# only rely on arguments for configuration.
336#
337# Examples:
338#
339# CEPH_ARGS="--fsid=$(uuidgen) "
340# CEPH_ARGS+="--mon-host=127.0.0.1:7018 "
341# run_mon $dir a # spawn a mon and bind port 7018
342# run_mon $dir a --debug-filestore=20 # spawn with filestore debugging
343#
# If mon_initial_members is not set, the default rbd pool is deleted
# and replaced with a replicated pool with fewer placement groups, to
# speed up initialization. If mon_initial_members is set, no attempt
# is made to recreate the rbd pool because it would hang forever,
# waiting for other mons to join.
349#
350# A **dir**/ceph.conf file is created but not meant to be used by any
351# function. It is convenient for debugging a failure with:
352#
353# ceph --conf **dir**/ceph.conf -s
354#
355# @param dir path name of the environment
356# @param id mon identifier
357# @param ... can be any option valid for ceph-mon
358# @return 0 on success, 1 on error
359#
360function run_mon() {
361 local dir=$1
362 shift
363 local id=$1
364 shift
365 local data=$dir/$id
366
367 ceph-mon \
368 --id $id \
369 --mkfs \
370 --mon-data=$data \
371 --run-dir=$dir \
372 "$@" || return 1
373
374 ceph-mon \
375 --id $id \
376 --mon-osd-full-ratio=.99 \
377 --mon-data-avail-crit=1 \
378 --paxos-propose-interval=0.1 \
379 --osd-crush-chooseleaf-type=0 \
380 --erasure-code-dir=$CEPH_LIB \
381 --plugin-dir=$CEPH_LIB \
382 --debug-mon 20 \
383 --debug-ms 20 \
384 --debug-paxos 20 \
385 --chdir= \
386 --mon-data=$data \
387 --log-file=$dir/\$name.log \
388 --admin-socket=$dir/\$cluster-\$name.asok \
389 --mon-cluster-log-file=$dir/log \
390 --run-dir=$dir \
391 --pid-file=$dir/\$name.pid \
392 --mon-allow-pool-delete \
393 "$@" || return 1
394
395 cat > $dir/ceph.conf <<EOF
396[global]
397fsid = $(get_config mon $id fsid)
398mon host = $(get_config mon $id mon_host)
399EOF
400 if test -z "$(get_config mon $id mon_initial_members)" ; then
401 ceph osd pool delete rbd rbd --yes-i-really-really-mean-it || return 1
402 ceph osd pool create rbd $PG_NUM || return 1
403 ceph osd set-backfillfull-ratio .99
404 fi
405}
406
407function test_run_mon() {
408 local dir=$1
409
410 setup $dir || return 1
411
412 run_mon $dir a --mon-initial-members=a || return 1
413 # rbd has not been deleted / created, hence it has pool id 0
414 ceph osd dump | grep "pool 0 'rbd'" || return 1
415 kill_daemons $dir || return 1
416
417 run_mon $dir a || return 1
418 # rbd has been deleted / created, hence it does not have pool id 0
419 ! ceph osd dump | grep "pool 0 'rbd'" || return 1
420 local size=$(CEPH_ARGS='' ceph --format=json daemon $dir/ceph-mon.a.asok \
421 config get osd_pool_default_size)
422 test "$size" = '{"osd_pool_default_size":"3"}' || return 1
423
424 ! CEPH_ARGS='' ceph status || return 1
425 CEPH_ARGS='' ceph --conf $dir/ceph.conf status || return 1
426
427 kill_daemons $dir || return 1
428
429 run_mon $dir a --osd_pool_default_size=1 || return 1
430 local size=$(CEPH_ARGS='' ceph --format=json daemon $dir/ceph-mon.a.asok \
431 config get osd_pool_default_size)
432 test "$size" = '{"osd_pool_default_size":"1"}' || return 1
433 kill_daemons $dir || return 1
434
435 CEPH_ARGS="$CEPH_ARGS --osd_pool_default_size=2" \
436 run_mon $dir a || return 1
437 local size=$(CEPH_ARGS='' ceph --format=json daemon $dir/ceph-mon.a.asok \
438 config get osd_pool_default_size)
439 test "$size" = '{"osd_pool_default_size":"2"}' || return 1
440 kill_daemons $dir || return 1
441
442 teardown $dir || return 1
443}
444
445#######################################################################
446
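##
# Run a manager by the name mgr.**id** with data in **dir**/**id**.
# The logs can be found in **dir**/mgr.**id**.log, the pid file is
# **dir**/mgr.**id**.pid and the admin socket is
# **dir**/ceph-mgr.**id**.asok.
#
# The remaining arguments are passed verbatim to ceph-mgr.
#
# @param dir path name of the environment
# @param id mgr identifier
# @param ... can be any option valid for ceph-mgr
# @return 0 on success, 1 on error
#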
447function run_mgr() {
448 local dir=$1
449 shift
450 local id=$1
451 shift
452 local data=$dir/$id
453
454 ceph-mgr \
455 --id $id \
456 --erasure-code-dir=$CEPH_LIB \
457 --plugin-dir=$CEPH_LIB \
458 --debug-mgr 20 \
459 --debug-objecter 20 \
460 --debug-ms 20 \
461 --debug-paxos 20 \
462 --chdir= \
463 --mgr-data=$data \
464 --log-file=$dir/\$name.log \
465 --admin-socket=$dir/\$cluster-\$name.asok \
466 --run-dir=$dir \
467 --pid-file=$dir/\$name.pid \
468 "$@" || return 1
469}
470
471#######################################################################
472
473##
474# Create (prepare) and run (activate) an osd by the name osd.**id**
475# with data in **dir**/**id**. The logs can be found in
476# **dir**/osd.**id**.log, the pid file is **dir**/osd.**id**.pid and
# the admin socket is **dir**/ceph-osd.**id**.asok.
478#
479# The remaining arguments are passed verbatim to ceph-osd.
480#
481# Two mandatory arguments must be provided: --fsid and --mon-host
482# Instead of adding them to every call to run_osd, they can be
483# set in the CEPH_ARGS environment variable to be read implicitly
484# by every ceph command.
485#
486# The CEPH_CONF variable is expected to be set to /dev/null to
487# only rely on arguments for configuration.
488#
489# The run_osd function creates the OSD data directory with ceph-disk
490# prepare on the **dir**/**id** directory and relies on the
491# activate_osd function to run the daemon.
492#
493# Examples:
494#
495# CEPH_ARGS="--fsid=$(uuidgen) "
496# CEPH_ARGS+="--mon-host=127.0.0.1:7018 "
497# run_osd $dir 0 # prepare and activate an osd using the monitor listening on 7018
498#
499# @param dir path name of the environment
500# @param id osd identifier
501# @param ... can be any option valid for ceph-osd
502# @return 0 on success, 1 on error
503#
504function run_osd() {
505 local dir=$1
506 shift
507 local id=$1
508 shift
509 local osd_data=$dir/$id
510
511 local ceph_disk_args
512 ceph_disk_args+=" --statedir=$dir"
513 ceph_disk_args+=" --sysconfdir=$dir"
514 ceph_disk_args+=" --prepend-to-path="
515
516 mkdir -p $osd_data
517 ceph-disk $ceph_disk_args \
518 prepare $osd_data || return 1
519
520 activate_osd $dir $id "$@"
521}
522
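##
# Same as run_osd, except that the OSD data directory is prepared with
# ceph-disk prepare --bluestore and the daemon is activated with the
# experimental bluestore feature enabled.
#
# @param dir path name of the environment
# @param id osd identifier
# @param ... can be any option valid for ceph-osd
# @return 0 on success, 1 on error
#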
523function run_osd_bluestore() {
524 local dir=$1
525 shift
526 local id=$1
527 shift
528 local osd_data=$dir/$id
529
530 local ceph_disk_args
531 ceph_disk_args+=" --statedir=$dir"
532 ceph_disk_args+=" --sysconfdir=$dir"
533 ceph_disk_args+=" --prepend-to-path="
534
535 mkdir -p $osd_data
536 ceph-disk $ceph_disk_args \
537 prepare --bluestore $osd_data || return 1
538
539 local ceph_osd_args
540 ceph_osd_args+=" --enable-experimental-unrecoverable-data-corrupting-features=bluestore"
541 activate_osd $dir $id $ceph_osd_args "$@"
542}
543
544function test_run_osd() {
545 local dir=$1
546
547 setup $dir || return 1
548
549 run_mon $dir a || return 1
550 run_mgr $dir x || return 1
551
552 run_osd $dir 0 || return 1
553 local backfills=$(CEPH_ARGS='' ceph --format=json daemon $dir//ceph-osd.0.asok \
554 config get osd_max_backfills)
555 echo "$backfills" | grep --quiet 'osd_max_backfills' || return 1
556
557 run_osd $dir 1 --osd-max-backfills 20 || return 1
558 local backfills=$(CEPH_ARGS='' ceph --format=json daemon $dir//ceph-osd.1.asok \
559 config get osd_max_backfills)
560 test "$backfills" = '{"osd_max_backfills":"20"}' || return 1
561
562 CEPH_ARGS="$CEPH_ARGS --osd-max-backfills 30" run_osd $dir 2 || return 1
563 local backfills=$(CEPH_ARGS='' ceph --format=json daemon $dir//ceph-osd.2.asok \
564 config get osd_max_backfills)
565 test "$backfills" = '{"osd_max_backfills":"30"}' || return 1
566
567 teardown $dir || return 1
568}
569
570#######################################################################
571
572##
573# Shutdown and remove all traces of the osd by the name osd.**id**.
574#
575# The OSD is shutdown with the TERM signal. It is then removed from
576# the auth list, crush map, osd map etc and the files associated with
577# it are also removed.
578#
579# @param dir path name of the environment
580# @param id osd identifier
581# @return 0 on success, 1 on error
582#
583function destroy_osd() {
584 local dir=$1
585 local id=$2
586
587 kill_daemons $dir TERM osd.$id || return 1
588 ceph osd out osd.$id || return 1
589 ceph auth del osd.$id || return 1
590 ceph osd crush remove osd.$id || return 1
591 ceph osd rm $id || return 1
592 teardown $dir/$id || return 1
593 rm -fr $dir/$id
594}
595
596function test_destroy_osd() {
597 local dir=$1
598
599 setup $dir || return 1
600 run_mon $dir a || return 1
601 run_mgr $dir x || return 1
602 run_osd $dir 0 || return 1
603 destroy_osd $dir 0 || return 1
    ! ceph osd dump | grep "osd.0 " || return 1
605 teardown $dir || return 1
606}
607
608#######################################################################
609
610##
611# Run (activate) an osd by the name osd.**id** with data in
612# **dir**/**id**. The logs can be found in **dir**/osd.**id**.log,
613# the pid file is **dir**/osd.**id**.pid and the admin socket is
614# **dir**/**id**/ceph-osd.**id**.asok.
615#
616# The remaining arguments are passed verbatim to ceph-osd.
617#
618# Two mandatory arguments must be provided: --fsid and --mon-host
619# Instead of adding them to every call to activate_osd, they can be
620# set in the CEPH_ARGS environment variable to be read implicitly
621# by every ceph command.
622#
623# The CEPH_CONF variable is expected to be set to /dev/null to
624# only rely on arguments for configuration.
625#
626# The activate_osd function expects a valid OSD data directory
627# in **dir**/**id**, either just created via run_osd or re-using
628# one left by a previous run of ceph-osd. The ceph-osd daemon is
629# run indirectly via ceph-disk activate.
630#
631# The activate_osd function blocks until the monitor reports the osd
632# up. If it fails to do so within $TIMEOUT seconds, activate_osd
633# fails.
634#
635# Examples:
636#
637# CEPH_ARGS="--fsid=$(uuidgen) "
638# CEPH_ARGS+="--mon-host=127.0.0.1:7018 "
639# activate_osd $dir 0 # activate an osd using the monitor listening on 7018
640#
641# @param dir path name of the environment
642# @param id osd identifier
643# @param ... can be any option valid for ceph-osd
644# @return 0 on success, 1 on error
645#
646function activate_osd() {
647 local dir=$1
648 shift
649 local id=$1
650 shift
651 local osd_data=$dir/$id
652
653 local ceph_disk_args
654 ceph_disk_args+=" --statedir=$dir"
655 ceph_disk_args+=" --sysconfdir=$dir"
656 ceph_disk_args+=" --prepend-to-path="
657
658 local ceph_args="$CEPH_ARGS"
659 ceph_args+=" --enable-experimental-unrecoverable-data-corrupting-features=bluestore"
660 ceph_args+=" --osd-failsafe-full-ratio=.99"
661 ceph_args+=" --osd-journal-size=100"
662 ceph_args+=" --osd-scrub-load-threshold=2000"
663 ceph_args+=" --osd-data=$osd_data"
664 ceph_args+=" --chdir="
665 ceph_args+=" --erasure-code-dir=$CEPH_LIB"
666 ceph_args+=" --plugin-dir=$CEPH_LIB"
667 ceph_args+=" --osd-class-dir=$CEPH_LIB"
668 ceph_args+=" --run-dir=$dir"
669 ceph_args+=" --debug-osd=20"
670 ceph_args+=" --log-file=$dir/\$name.log"
671 ceph_args+=" --pid-file=$dir/\$name.pid"
672 ceph_args+=" --osd-max-object-name-len 460"
673 ceph_args+=" --osd-max-object-namespace-len 64"
674 ceph_args+=" "
675 ceph_args+="$@"
676 mkdir -p $osd_data
677 CEPH_ARGS="$ceph_args " ceph-disk $ceph_disk_args \
678 activate \
679 --mark-init=none \
680 $osd_data || return 1
681
682 [ "$id" = "$(cat $osd_data/whoami)" ] || return 1
683
684 wait_for_osd up $id || return 1
685}
686
687function test_activate_osd() {
688 local dir=$1
689
690 setup $dir || return 1
691
692 run_mon $dir a || return 1
693 run_mgr $dir x || return 1
694
695 run_osd $dir 0 || return 1
696 local backfills=$(CEPH_ARGS='' ceph --format=json daemon $dir//ceph-osd.0.asok \
697 config get osd_max_backfills)
698 echo "$backfills" | grep --quiet 'osd_max_backfills' || return 1
699
700 kill_daemons $dir TERM osd || return 1
701
702 activate_osd $dir 0 --osd-max-backfills 20 || return 1
703 local backfills=$(CEPH_ARGS='' ceph --format=json daemon $dir//ceph-osd.0.asok \
704 config get osd_max_backfills)
705 test "$backfills" = '{"osd_max_backfills":"20"}' || return 1
706
707 teardown $dir || return 1
708}
709
710#######################################################################
711
712##
713# Wait until the OSD **id** is either up or down, as specified by
714# **state**. It fails after $TIMEOUT seconds.
715#
716# @param state either up or down
717# @param id osd identifier
718# @return 0 on success, 1 on error
719#
720function wait_for_osd() {
721 local state=$1
722 local id=$2
723
724 status=1
725 for ((i=0; i < $TIMEOUT; i++)); do
726 echo $i
727 if ! ceph osd dump | grep "osd.$id $state"; then
728 sleep 1
729 else
730 status=0
731 break
732 fi
733 done
734 return $status
735}
736
737function test_wait_for_osd() {
738 local dir=$1
739 setup $dir || return 1
740 run_mon $dir a --osd_pool_default_size=1 || return 1
741 run_mgr $dir x || return 1
742 run_osd $dir 0 || return 1
743 wait_for_osd up 0 || return 1
744 kill_daemons $dir TERM osd || return 1
745 wait_for_osd down 0 || return 1
746 ( TIMEOUT=1 ; ! wait_for_osd up 0 ) || return 1
747 teardown $dir || return 1
748}
749
750#######################################################################
751
752##
753# Display the list of OSD ids supporting the **objectname** stored in
754# **poolname**, as reported by ceph osd map.
755#
756# @param poolname an existing pool
757# @param objectname an objectname (may or may not exist)
758# @param STDOUT white space separated list of OSD ids
759# @return 0 on success, 1 on error
760#
761function get_osds() {
762 local poolname=$1
763 local objectname=$2
764
765 local osds=$(ceph --format xml osd map $poolname $objectname 2>/dev/null | \
766 $XMLSTARLET sel -t -m "//acting/osd" -v . -o ' ')
767 # get rid of the trailing space
768 echo $osds
769}
770
771function test_get_osds() {
772 local dir=$1
773
774 setup $dir || return 1
775 run_mon $dir a --osd_pool_default_size=2 || return 1
776 run_mgr $dir x || return 1
777 run_osd $dir 0 || return 1
778 run_osd $dir 1 || return 1
779 wait_for_clean || return 1
780 get_osds rbd GROUP | grep --quiet '^[0-1] [0-1]$' || return 1
781 teardown $dir || return 1
782}
783
784#######################################################################
785
786##
# Wait for the monitors to form quorum (optionally, of size N)
#
# @param timeout maximum duration to wait for the quorum to form
790# @param quorumsize size of quorum to wait for
791# @return 0 on success, 1 on error
792#
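# Example (a minimal sketch, assuming three mons a, b and c have been
# started with run_mon):
#
#   wait_for_quorum 300 3 || return 1
#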
793function wait_for_quorum() {
794 local timeout=$1
795 local quorumsize=$2
796
797 if [[ -z "$timeout" ]]; then
798 timeout=300
799 fi
800
801 if [[ -z "$quorumsize" ]]; then
802 timeout $timeout ceph mon_status --format=json >&/dev/null || return 1
803 return 0
804 fi
805
806 no_quorum=1
807 wait_until=$((`date +%s` + $timeout))
808 while [[ $(date +%s) -lt $wait_until ]]; do
809 jqfilter='.quorum | length == '$quorumsize
810 jqinput="$(timeout $timeout ceph mon_status --format=json 2>/dev/null)"
811 res=$(echo $jqinput | jq "$jqfilter")
812 if [[ "$res" == "true" ]]; then
813 no_quorum=0
814 break
815 fi
816 done
817 return $no_quorum
818}
819
820#######################################################################
821
822##
# Return the PG supporting the **objectname** stored in
824# **poolname**, as reported by ceph osd map.
825#
826# @param poolname an existing pool
827# @param objectname an objectname (may or may not exist)
828# @param STDOUT a PG
829# @return 0 on success, 1 on error
830#
831function get_pg() {
832 local poolname=$1
833 local objectname=$2
834
835 ceph --format xml osd map $poolname $objectname 2>/dev/null | \
836 $XMLSTARLET sel -t -m "//pgid" -v . -n
837}
838
839function test_get_pg() {
840 local dir=$1
841
842 setup $dir || return 1
843 run_mon $dir a --osd_pool_default_size=1 || return 1
844 run_mgr $dir x || return 1
845 run_osd $dir 0 || return 1
846 wait_for_clean || return 1
847 get_pg rbd GROUP | grep --quiet '^[0-9]\.[0-9a-f][0-9a-f]*$' || return 1
848 teardown $dir || return 1
849}
850
851#######################################################################
852
853##
854# Return the value of the **config**, obtained via the config get command
855# of the admin socket of **daemon**.**id**.
856#
857# @param daemon mon or osd
858# @param id mon or osd ID
859# @param config the configuration variable name as found in config_opts.h
860# @param STDOUT the config value
861# @return 0 on success, 1 on error
862#
863function get_config() {
864 local daemon=$1
865 local id=$2
866 local config=$3
867
868 CEPH_ARGS='' \
869 ceph --format xml daemon $dir/ceph-$daemon.$id.asok \
870 config get $config 2> /dev/null | \
871 $XMLSTARLET sel -t -m "//$config" -v . -n
872}
873
874function test_get_config() {
875 local dir=$1
876
877 # override the default config using command line arg and check it
878 setup $dir || return 1
879 run_mon $dir a --osd_pool_default_size=1 || return 1
880 test $(get_config mon a osd_pool_default_size) = 1 || return 1
881 run_mgr $dir x || return 1
882 run_osd $dir 0 --osd_max_scrubs=3 || return 1
883 test $(get_config osd 0 osd_max_scrubs) = 3 || return 1
884 teardown $dir || return 1
885}
886
887#######################################################################
888
889##
890# Set the **config** to specified **value**, via the config set command
891# of the admin socket of **daemon**.**id**
892#
893# @param daemon mon or osd
894# @param id mon or osd ID
895# @param config the configuration variable name as found in config_opts.h
896# @param value the config value
897# @return 0 on success, 1 on error
898#
899function set_config() {
900 local daemon=$1
901 local id=$2
902 local config=$3
903 local value=$4
904
905 CEPH_ARGS='' \
906 ceph --format xml daemon $dir/ceph-$daemon.$id.asok \
907 config set $config $value 2> /dev/null | \
908 $XMLSTARLET sel -Q -t -m "//success" -v .
909}
910
911function test_set_config() {
912 local dir=$1
913
914 setup $dir || return 1
915 run_mon $dir a --osd_pool_default_size=1 || return 1
916 test $(get_config mon a ms_crc_header) = true || return 1
917 set_config mon a ms_crc_header false || return 1
918 test $(get_config mon a ms_crc_header) = false || return 1
919 set_config mon a ms_crc_header true || return 1
920 test $(get_config mon a ms_crc_header) = true || return 1
921 teardown $dir || return 1
922}
923
924#######################################################################
925
926##
927# Return the OSD id of the primary OSD supporting the **objectname**
928# stored in **poolname**, as reported by ceph osd map.
929#
930# @param poolname an existing pool
931# @param objectname an objectname (may or may not exist)
932# @param STDOUT the primary OSD id
933# @return 0 on success, 1 on error
934#
935function get_primary() {
936 local poolname=$1
937 local objectname=$2
938
939 ceph --format xml osd map $poolname $objectname 2>/dev/null | \
940 $XMLSTARLET sel -t -m "//acting_primary" -v . -n
941}
942
943function test_get_primary() {
944 local dir=$1
945
946 setup $dir || return 1
947 run_mon $dir a --osd_pool_default_size=1 || return 1
948 local osd=0
949 run_mgr $dir x || return 1
950 run_osd $dir $osd || return 1
951 wait_for_clean || return 1
952 test $(get_primary rbd GROUP) = $osd || return 1
953 teardown $dir || return 1
954}
955
956#######################################################################
957
958##
959# Return the id of any OSD supporting the **objectname** stored in
960# **poolname**, as reported by ceph osd map, except the primary.
961#
962# @param poolname an existing pool
963# @param objectname an objectname (may or may not exist)
964# @param STDOUT the OSD id
965# @return 0 on success, 1 on error
966#
967function get_not_primary() {
968 local poolname=$1
969 local objectname=$2
970
971 local primary=$(get_primary $poolname $objectname)
972 ceph --format xml osd map $poolname $objectname 2>/dev/null | \
973 $XMLSTARLET sel -t -m "//acting/osd[not(.='$primary')]" -v . -n | \
974 head -1
975}
976
977function test_get_not_primary() {
978 local dir=$1
979
980 setup $dir || return 1
981 run_mon $dir a --osd_pool_default_size=2 || return 1
982 run_mgr $dir x || return 1
983 run_osd $dir 0 || return 1
984 run_osd $dir 1 || return 1
985 wait_for_clean || return 1
986 local primary=$(get_primary rbd GROUP)
987 local not_primary=$(get_not_primary rbd GROUP)
988 test $not_primary != $primary || return 1
989 test $not_primary = 0 -o $not_primary = 1 || return 1
990 teardown $dir || return 1
991}
992
993#######################################################################
994
995##
996# Run ceph-objectstore-tool against the OSD **id** using the data path
997# **dir**. The OSD is killed with TERM prior to running
998# ceph-objectstore-tool because access to the data path is
999# exclusive. The OSD is restarted after the command completes. The
# objectstore_tool function returns after all PGs are active+clean again.
1001#
1002# @param dir the data path of the OSD
1003# @param id the OSD id
1004# @param ... arguments to ceph-objectstore-tool
1005# @param STDIN the input of ceph-objectstore-tool
1006# @param STDOUT the output of ceph-objectstore-tool
1007# @return 0 on success, 1 on error
1008#
1009# The value of $ceph_osd_args will be passed to restarted osds
1010#
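# Example (a minimal sketch; the GROUP object is assumed to have been
# written to the rbd pool beforehand, as in test_objectstore_tool below):
#
#   objectstore_tool $dir 0 GROUP get-bytes > $dir/GROUP || return 1
#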
1011function objectstore_tool() {
1012 local dir=$1
1013 shift
1014 local id=$1
1015 shift
1016 local osd_data=$dir/$id
1017
1018 local osd_type=$(cat $osd_data/type)
1019
1020 kill_daemons $dir TERM osd.$id >&2 < /dev/null || return 1
1021
    local journal_args
    if [ "$osd_type" == "filestore" ]; then
        journal_args=" --journal-path $osd_data/journal"
    fi
1026 ceph-objectstore-tool \
1027 --enable-experimental-unrecoverable-data-corrupting-features=bluestore \
1028 --data-path $osd_data \
1029 $journal_args \
1030 "$@" || return 1
1031 activate_osd $dir $id $ceph_osd_args >&2 || return 1
1032 wait_for_clean >&2
1033}
1034
1035function test_objectstore_tool() {
1036 local dir=$1
1037
1038 setup $dir || return 1
1039 run_mon $dir a --osd_pool_default_size=1 || return 1
1040 local osd=0
1041 run_mgr $dir x || return 1
1042 run_osd $dir $osd || return 1
1043 wait_for_clean || return 1
1044 rados --pool rbd put GROUP /etc/group || return 1
1045 objectstore_tool $dir $osd GROUP get-bytes | \
1046 diff - /etc/group
1047 ! objectstore_tool $dir $osd NOTEXISTS get-bytes || return 1
1048 teardown $dir || return 1
1049}
1050
1051#######################################################################
1052
1053##
1054# Predicate checking if there is an ongoing recovery in the
1055# cluster. If any of the recovering_{keys,bytes,objects}_per_sec
1056# counters are reported by ceph status, it means recovery is in
1057# progress.
1058#
1059# @return 0 if recovery in progress, 1 otherwise
1060#
1061function get_is_making_recovery_progress() {
1062 local progress=$(ceph --format xml status 2>/dev/null | \
1063 $XMLSTARLET sel \
1064 -t -m "//pgmap/recovering_keys_per_sec" -v . -o ' ' \
1065 -t -m "//pgmap/recovering_bytes_per_sec" -v . -o ' ' \
1066 -t -m "//pgmap/recovering_objects_per_sec" -v .)
1067 test -n "$progress"
1068}
1069
1070function test_get_is_making_recovery_progress() {
1071 local dir=$1
1072
1073 setup $dir || return 1
1074 run_mon $dir a || return 1
1075 run_mgr $dir x || return 1
1076 ! get_is_making_recovery_progress || return 1
1077 teardown $dir || return 1
1078}
1079
1080#######################################################################
1081
1082##
# Return the number of active+clean PGs in the cluster. A PG is counted
# if ceph pg dump pgs reports it both **active** and **clean** and
# not **stale**.
1086#
1087# @param STDOUT the number of active PGs
1088# @return 0 on success, 1 on error
1089#
1090function get_num_active_clean() {
1091 local expression="("
1092 expression+="contains(.,'active') and "
1093 expression+="contains(.,'clean') and "
1094 expression+="not(contains(.,'stale'))"
1095 expression+=")"
1096 # xmlstarlet 1.3.0 (which is on Ubuntu precise)
1097 # add extra new lines that must be ignored with
1098 # grep -v '^$'
1099 ceph --format xml pg dump pgs 2>/dev/null | \
1100 $XMLSTARLET sel -t -m "//pg_stat/state[$expression]" -v . -n | \
1101 grep -cv '^$'
1102}
1103
1104function test_get_num_active_clean() {
1105 local dir=$1
1106
1107 setup $dir || return 1
1108 run_mon $dir a --osd_pool_default_size=1 || return 1
1109 run_mgr $dir x || return 1
1110 run_osd $dir 0 || return 1
1111 wait_for_clean || return 1
1112 local num_active_clean=$(get_num_active_clean)
1113 test "$num_active_clean" = $PG_NUM || return 1
1114 teardown $dir || return 1
1115}
1116
1117#######################################################################
1118
1119##
# Return the number of PGs in the cluster, as reported by the pgmap
# in ceph status.
1122#
1123# @param STDOUT the number of PGs
1124# @return 0 on success, 1 on error
1125#
1126function get_num_pgs() {
1127 ceph --format xml status 2>/dev/null | \
1128 $XMLSTARLET sel -t -m "//pgmap/num_pgs" -v .
1129}
1130
1131function test_get_num_pgs() {
1132 local dir=$1
1133
1134 setup $dir || return 1
1135 run_mon $dir a --osd_pool_default_size=1 || return 1
1136 run_mgr $dir x || return 1
1137 run_osd $dir 0 || return 1
1138 wait_for_clean || return 1
1139 local num_pgs=$(get_num_pgs)
1140 test "$num_pgs" -gt 0 || return 1
1141 teardown $dir || return 1
1142}
1143
1144#######################################################################
1145
1146##
1147# Return the date and time of the last completed scrub for **pgid**,
1148# as reported by ceph pg dump pgs. Note that a repair also sets this
1149# date.
1150#
1151# @param pgid the id of the PG
1152# @param STDOUT the date and time of the last scrub
1153# @return 0 on success, 1 on error
1154#
1155function get_last_scrub_stamp() {
1156 local pgid=$1
1157 local sname=${2:-last_scrub_stamp}
1158 ceph --format xml pg dump pgs 2>/dev/null | \
1159 $XMLSTARLET sel -t -m "//pg_stat[pgid='$pgid']/$sname" -v .
1160}
1161
1162function test_get_last_scrub_stamp() {
1163 local dir=$1
1164
1165 setup $dir || return 1
1166 run_mon $dir a --osd_pool_default_size=1 || return 1
1167 run_mgr $dir x || return 1
1168 run_osd $dir 0 || return 1
1169 wait_for_clean || return 1
1170 stamp=$(get_last_scrub_stamp 1.0)
1171 test -n "$stamp" || return 1
1172 teardown $dir || return 1
1173}
1174
1175#######################################################################
1176
1177##
1178# Predicate checking if the cluster is clean, i.e. all of its PGs are
1179# in a clean state (see get_num_active_clean for a definition).
1180#
1181# @return 0 if the cluster is clean, 1 otherwise
1182#
1183function is_clean() {
1184 num_pgs=$(get_num_pgs)
1185 test $num_pgs != 0 || return 1
1186 test $(get_num_active_clean) = $num_pgs || return 1
1187}
1188
1189function test_is_clean() {
1190 local dir=$1
1191
1192 setup $dir || return 1
1193 run_mon $dir a --osd_pool_default_size=1 || return 1
1194 run_mgr $dir x || return 1
1195 run_osd $dir 0 || return 1
1196 wait_for_clean || return 1
1197 is_clean || return 1
1198 teardown $dir || return 1
1199}
1200
1201#######################################################################
1202
1203##
# Return a list of numbers that are increasingly larger and whose
# total is **timeout** seconds. It can be used to have short sleep
# delays while waiting for an event on a fast machine. But if the
# machine is running very slowly, the larger delays avoid stressing
# it even further or spamming the logs.
1209#
1210# @param timeout sum of all delays, in seconds
1211# @return a list of sleep delays
1212#
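# Example (a minimal sketch of the call pattern used by wait_for_clean
# and wait_for_health below):
#
#   local -a delays=($(get_timeout_delays $TIMEOUT .1))
#   local -i loop=0
#   while ! ceph health detail | grep HEALTH_OK ; do
#       (( loop < ${#delays[*]} )) || return 1
#       sleep ${delays[$loop]}
#       loop+=1
#   done
#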
1213function get_timeout_delays() {
1214 local trace=$(shopt -q -o xtrace && echo true || echo false)
1215 $trace && shopt -u -o xtrace
1216 local timeout=$1
1217 local first_step=${2:-1}
1218
1219 local i
1220 local total="0"
1221 i=$first_step
1222 while test "$(echo $total + $i \<= $timeout | bc -l)" = "1"; do
1223 echo -n "$i "
1224 total=$(echo $total + $i | bc -l)
1225 i=$(echo $i \* 2 | bc -l)
1226 done
1227 if test "$(echo $total \< $timeout | bc -l)" = "1"; then
1228 echo -n $(echo $timeout - $total | bc -l)
1229 fi
1230 $trace && shopt -s -o xtrace
1231}
1232
1233function test_get_timeout_delays() {
1234 test "$(get_timeout_delays 1)" = "1 " || return 1
1235 test "$(get_timeout_delays 5)" = "1 2 2" || return 1
1236 test "$(get_timeout_delays 6)" = "1 2 3" || return 1
1237 test "$(get_timeout_delays 7)" = "1 2 4 " || return 1
1238 test "$(get_timeout_delays 8)" = "1 2 4 1" || return 1
1239 test "$(get_timeout_delays 1 .1)" = ".1 .2 .4 .3" || return 1
1240 test "$(get_timeout_delays 1.5 .1)" = ".1 .2 .4 .8 " || return 1
1241 test "$(get_timeout_delays 5 .1)" = ".1 .2 .4 .8 1.6 1.9" || return 1
1242 test "$(get_timeout_delays 6 .1)" = ".1 .2 .4 .8 1.6 2.9" || return 1
1243 test "$(get_timeout_delays 6.3 .1)" = ".1 .2 .4 .8 1.6 3.2 " || return 1
1244 test "$(get_timeout_delays 20 .1)" = ".1 .2 .4 .8 1.6 3.2 6.4 7.3" || return 1
1245}
1246
1247#######################################################################
1248
1249##
# Wait until the cluster becomes clean, or give up if it does not make
# progress for $TIMEOUT seconds.
# Progress is measured either via the **get_is_making_recovery_progress**
# predicate or by a change in the number of clean PGs (as returned by
# get_num_active_clean).
1254#
1255# @return 0 if the cluster is clean, 1 otherwise
1256#
1257function wait_for_clean() {
1258 local num_active_clean=-1
1259 local cur_active_clean
1260 local -a delays=($(get_timeout_delays $TIMEOUT .1))
1261 local -i loop=0
1262 test $(get_num_pgs) != 0 || return 1
1263
1264 while true ; do
1265 # Comparing get_num_active_clean & get_num_pgs is used to determine
1266 # if the cluster is clean. That's almost an inline of is_clean() to
1267 # get more performance by avoiding multiple calls of get_num_active_clean.
1268 cur_active_clean=$(get_num_active_clean)
1269 test $cur_active_clean = $(get_num_pgs) && break
1270 if test $cur_active_clean != $num_active_clean ; then
1271 loop=0
1272 num_active_clean=$cur_active_clean
1273 elif get_is_making_recovery_progress ; then
1274 loop=0
1275 elif (( $loop >= ${#delays[*]} )) ; then
1276 ceph report
1277 return 1
1278 fi
1279 sleep ${delays[$loop]}
1280 loop+=1
1281 done
1282 return 0
1283}
1284
1285function test_wait_for_clean() {
1286 local dir=$1
1287
1288 setup $dir || return 1
1289 run_mon $dir a --osd_pool_default_size=1 || return 1
1290 run_mgr $dir x || return 1
1291 ! TIMEOUT=1 wait_for_clean || return 1
1292 run_osd $dir 0 || return 1
1293 wait_for_clean || return 1
1294 teardown $dir || return 1
1295}
1296
1297#######################################################################
1298
1299##
# Wait until ceph health detail matches **grepstr** (HEALTH_OK for
# wait_for_health_ok), or give up if it does not happen within
# $TIMEOUT seconds.
1302#
1303# @return 0 if the cluster is HEALTHY, 1 otherwise
1304#
1305function wait_for_health() {
1306 local grepstr=$1
1307 local -a delays=($(get_timeout_delays $TIMEOUT .1))
1308 local -i loop=0
1309
1310 while ! ceph health detail | grep "$grepstr" ; do
1311 if (( $loop >= ${#delays[*]} )) ; then
1312 ceph health detail
1313 return 1
1314 fi
1315 sleep ${delays[$loop]}
1316 loop+=1
1317 done
1318}
1319
1320function wait_for_health_ok() {
1321 wait_for_health "HEALTH_OK" || return 1
1322}
1323
1324function test_wait_for_health_ok() {
1325 local dir=$1
1326
1327 setup $dir || return 1
1328 run_mon $dir a --osd_pool_default_size=1 --osd_failsafe_full_ratio=.99 --mon_pg_warn_min_per_osd=0 || return 1
1329 run_mgr $dir x || return 1
1330 ! TIMEOUT=1 wait_for_health_ok || return 1
1331 run_osd $dir 0 || return 1
1332 wait_for_health_ok || return 1
1333 teardown $dir || return 1
1334}
1335
1336
1337#######################################################################
1338
1339##
1340# Run repair on **pgid** and wait until it completes. The repair
1341# function will fail if repair does not complete within $TIMEOUT
1342# seconds.
1343#
1344# @param pgid the id of the PG
1345# @return 0 on success, 1 on error
1346#
1347function repair() {
1348 local pgid=$1
1349 local last_scrub=$(get_last_scrub_stamp $pgid)
1350 ceph pg repair $pgid
1351 wait_for_scrub $pgid "$last_scrub"
1352}
1353
1354function test_repair() {
1355 local dir=$1
1356
1357 setup $dir || return 1
1358 run_mon $dir a --osd_pool_default_size=1 || return 1
1359 run_mgr $dir x || return 1
1360 run_osd $dir 0 || return 1
1361 wait_for_clean || return 1
1362 repair 1.0 || return 1
1363 kill_daemons $dir KILL osd || return 1
1364 ! TIMEOUT=1 repair 1.0 || return 1
1365 teardown $dir || return 1
1366}
1367#######################################################################
1368
1369##
1370# Run scrub on **pgid** and wait until it completes. The pg_scrub
# function will fail if the scrub does not complete within $TIMEOUT
# seconds. The scrub is complete whenever the
1373# **get_last_scrub_stamp** function reports a timestamp different from
1374# the one stored before starting the scrub.
1375#
1376# @param pgid the id of the PG
1377# @return 0 on success, 1 on error
1378#
1379function pg_scrub() {
1380 local pgid=$1
1381 local last_scrub=$(get_last_scrub_stamp $pgid)
1382 ceph pg scrub $pgid
1383 wait_for_scrub $pgid "$last_scrub"
1384}
1385
1386function pg_deep_scrub() {
1387 local pgid=$1
1388 local last_scrub=$(get_last_scrub_stamp $pgid last_deep_scrub_stamp)
1389 ceph pg deep-scrub $pgid
1390 wait_for_scrub $pgid "$last_scrub" last_deep_scrub_stamp
1391}
1392
1393function test_pg_scrub() {
1394 local dir=$1
1395
1396 setup $dir || return 1
1397 run_mon $dir a --osd_pool_default_size=1 || return 1
1398 run_mgr $dir x || return 1
1399 run_osd $dir 0 || return 1
1400 wait_for_clean || return 1
1401 pg_scrub 1.0 || return 1
1402 kill_daemons $dir KILL osd || return 1
1403 ! TIMEOUT=1 pg_scrub 1.0 || return 1
1404 teardown $dir || return 1
1405}
1406
1407#######################################################################
1408
1409##
1410# Run the *command* and expect it to fail (i.e. return a non zero status).
1411# The output (stderr and stdout) is stored in a temporary file in *dir*
1412# and is expected to contain the string *expected*.
1413#
1414# Return 0 if the command failed and the string was found. Otherwise
1415# return 1 and cat the full output of the command on stderr for debug.
1416#
1417# @param dir temporary directory to store the output
1418# @param expected string to look for in the output
1419# @param command ... the command and its arguments
1420# @return 0 on success, 1 on error
1421#
1422
1423function expect_failure() {
1424 local dir=$1
1425 shift
1426 local expected="$1"
1427 shift
1428 local success
1429
1430 if "$@" > $dir/out 2>&1 ; then
1431 success=true
1432 else
1433 success=false
1434 fi
1435
1436 if $success || ! grep --quiet "$expected" $dir/out ; then
1437 cat $dir/out >&2
1438 return 1
1439 else
1440 return 0
1441 fi
1442}
1443
1444function test_expect_failure() {
1445 local dir=$1
1446
1447 setup $dir || return 1
1448 expect_failure $dir FAIL bash -c 'echo FAIL ; exit 1' || return 1
1449 # the command did not fail
1450 ! expect_failure $dir FAIL bash -c 'echo FAIL ; exit 0' > $dir/out || return 1
1451 grep --quiet FAIL $dir/out || return 1
1452 # the command failed but the output does not contain the expected string
1453 ! expect_failure $dir FAIL bash -c 'echo UNEXPECTED ; exit 1' > $dir/out || return 1
1454 ! grep --quiet FAIL $dir/out || return 1
1455 teardown $dir || return 1
1456}
1457
1458#######################################################################
1459
1460##
1461# Given the *last_scrub*, wait for scrub to happen on **pgid**. It
# will fail if the scrub does not complete within $TIMEOUT seconds. The
# scrub is complete whenever the **get_last_scrub_stamp** function
1464# reports a timestamp different from the one given in argument.
1465#
1466# @param pgid the id of the PG
1467# @param last_scrub timestamp of the last scrub for *pgid*
1468# @return 0 on success, 1 on error
1469#
1470function wait_for_scrub() {
1471 local pgid=$1
1472 local last_scrub="$2"
1473 local sname=${3:-last_scrub_stamp}
1474
1475 for ((i=0; i < $TIMEOUT; i++)); do
1476 if test "$last_scrub" != "$(get_last_scrub_stamp $pgid $sname)" ; then
1477 return 0
1478 fi
1479 sleep 1
1480 done
1481 return 1
1482}
1483
1484function test_wait_for_scrub() {
1485 local dir=$1
1486
1487 setup $dir || return 1
1488 run_mon $dir a --osd_pool_default_size=1 || return 1
1489 run_mgr $dir x || return 1
1490 run_osd $dir 0 || return 1
1491 wait_for_clean || return 1
1492 local pgid=1.0
1493 ceph pg repair $pgid
1494 local last_scrub=$(get_last_scrub_stamp $pgid)
1495 wait_for_scrub $pgid "$last_scrub" || return 1
1496 kill_daemons $dir KILL osd || return 1
1497 last_scrub=$(get_last_scrub_stamp $pgid)
1498 ! TIMEOUT=1 wait_for_scrub $pgid "$last_scrub" || return 1
1499 teardown $dir || return 1
1500}
1501
1502#######################################################################
1503
1504##
1505# Return 0 if the erasure code *plugin* is available, 1 otherwise.
1506#
1507# @param plugin erasure code plugin
1508# @return 0 on success, 1 on error
1509#
1510
1511function erasure_code_plugin_exists() {
1512 local plugin=$1
1513 local status
1514 local grepstr
1515 local s
1516 case `uname` in
1517 FreeBSD) grepstr="Cannot open.*$plugin" ;;
1518 *) grepstr="$plugin.*No such file" ;;
1519 esac
1520
1521 s=$(ceph osd erasure-code-profile set TESTPROFILE plugin=$plugin 2>&1)
1522 local status=$?
1523 if [ $status -eq 0 ]; then
1524 ceph osd erasure-code-profile rm TESTPROFILE
1525 elif ! echo $s | grep --quiet "$grepstr" ; then
1526 status=1
1527 # display why the string was rejected.
1528 echo $s
1529 fi
1530 return $status
1531}
1532
1533function test_erasure_code_plugin_exists() {
1534 local dir=$1
1535
1536 setup $dir || return 1
1537 run_mon $dir a || return 1
1538 run_mgr $dir x || return 1
1539 erasure_code_plugin_exists jerasure || return 1
1540 ! erasure_code_plugin_exists FAKE || return 1
1541 teardown $dir || return 1
1542}
1543
1544#######################################################################
1545
1546##
1547# Display all log files from **dir** on stdout.
1548#
1549# @param dir directory in which all data is stored
1550#
1551
1552function display_logs() {
1553 local dir=$1
1554
1555 find $dir -maxdepth 1 -name '*.log' | \
1556 while read file ; do
1557 echo "======================= $file"
1558 cat $file
1559 done
1560}
1561
1562function test_display_logs() {
1563 local dir=$1
1564
1565 setup $dir || return 1
1566 run_mon $dir a || return 1
1567 kill_daemons $dir || return 1
1568 display_logs $dir > $dir/log.out
1569 grep --quiet mon.a.log $dir/log.out || return 1
1570 teardown $dir || return 1
1571}
1572
1573#######################################################################
1574##
1575# Spawn a command in background and save the pid in the variable name
# passed in argument. To make the output easier to read, each output
# line is prepended with the process id.
1578#
1579# Example:
1580# pids1=""
1581# run_in_background pids1 bash -c 'sleep 1; exit 1'
1582#
1583# @param pid_variable the variable name (not value) where the pids will be stored
1584# @param ... the command to execute
1585# @return only the pid_variable output should be considered and used with **wait_background**
1586#
1587function run_in_background() {
1588 local pid_variable=$1
1589 shift;
    # Execute the command and prepend its output with its pid
    # We make sure the subshell returns the exit status of the command and not that of awk.
1592 ("$@" |& awk '{ a[i++] = $0 }END{for (i = 0; i in a; ++i) { print "'$$': " a[i]} }'; return ${PIPESTATUS[0]}) >&2 &
1593 eval "$pid_variable+=\" $!\""
1594}
1595
1596function test_run_in_background() {
1597 local pids
1598 run_in_background pids sleep 1
1599 run_in_background pids sleep 1
1600 test $(echo $pids | wc -w) = 2 || return 1
1601 wait $pids || return 1
1602}
1603
1604#######################################################################
1605##
1606# Wait for pids running in background to complete.
1607# This function is usually used after a **run_in_background** call
1608# Example:
1609# pids1=""
1610# run_in_background pids1 bash -c 'sleep 1; exit 1'
1611# wait_background pids1
1612#
# @param pids The variable name that contains the active PIDS. Set to empty at the end of the function.
# @return 1 if at least one process exits in error, 0 otherwise
1615#
1616function wait_background() {
1617 # We extract the PIDS from the variable name
1618 pids=${!1}
1619
1620 return_code=0
1621 for pid in $pids; do
1622 if ! wait $pid; then
1623 # If one process failed then return 1
1624 return_code=1
1625 fi
1626 done
1627
    # We empty the variable to report that all processes ended
1629 eval "$1=''"
1630
1631 return $return_code
1632}
1633
1634
1635function test_wait_background() {
1636 local pids=""
1637 run_in_background pids bash -c "sleep 1; exit 1"
1638 run_in_background pids bash -c "sleep 2; exit 0"
1639 wait_background pids
1640 if [ $? -ne 1 ]; then return 1; fi
1641
1642 run_in_background pids bash -c "sleep 1; exit 0"
1643 run_in_background pids bash -c "sleep 2; exit 0"
1644 wait_background pids
1645 if [ $? -ne 0 ]; then return 1; fi
1646
1647 if [ ! -z "$pids" ]; then return 1; fi
1648}
1649
1650#######################################################################
1651
1652##
1653# Call the **run** function (which must be defined by the caller) with
1654# the **dir** argument followed by the caller argument list.
1655#
1656# If the **run** function returns on error, all logs found in **dir**
1657# are displayed for diagnostic purposes.
1658#
# The **teardown** function is called when the **run** function returns
# (on success or on error), to clean up leftovers. CEPH_CONF is set
# to /dev/null and CEPH_ARGS is unset so that the tests are protected from
# external interference.
1663#
1664# It is the responsibility of the **run** function to call the
1665# **setup** function to prepare the test environment (create a temporary
1666# directory etc.).
1667#
1668# The shell is required (via PS4) to display the function and line
1669# number whenever a statement is executed to help debugging.
1670#
1671# @param dir directory in which all data is stored
1672# @param ... arguments passed transparently to **run**
1673# @return 0 on success, 1 on error
1674#
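# Example (a minimal sketch of a test script sourcing this file; the
# script name, the port and the body of run are hypothetical):
#
#   source ceph-helpers.sh
#
#   function run() {
#       local dir=$1
#       export CEPH_MON="127.0.0.1:7200" # any otherwise unused port
#       export CEPH_ARGS
#       CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
#       CEPH_ARGS+="--mon-host=$CEPH_MON "
#       setup $dir || return 1
#       run_mon $dir a || return 1
#       run_mgr $dir x || return 1
#       run_osd $dir 0 || return 1
#       wait_for_clean || return 1
#   }
#
#   main my-test "$@"
#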
1675function main() {
1676 local dir=td/$1
1677 shift
1678
1679 shopt -s -o xtrace
1680 PS4='${BASH_SOURCE[0]}:$LINENO: ${FUNCNAME[0]}: '
1681
1682 export PATH=${CEPH_BUILD_VIRTUALENV}/ceph-disk-virtualenv/bin:${CEPH_BUILD_VIRTUALENV}/ceph-detect-init-virtualenv/bin:.:$PATH # make sure program from sources are preferred
1683 #export PATH=$CEPH_ROOT/src/ceph-disk/virtualenv/bin:$CEPH_ROOT/src/ceph-detect-init/virtualenv/bin:.:$PATH # make sure program from sources are preferred
1684
1685 export CEPH_CONF=/dev/null
1686 unset CEPH_ARGS
1687
1688 local code
1689 if run $dir "$@" ; then
1690 code=0
1691 else
1692 display_logs $dir
1693 code=1
1694 fi
1695 teardown $dir || return 1
1696 return $code
1697}
1698
1699#######################################################################
1700
1701function run_tests() {
1702 shopt -s -o xtrace
1703 PS4='${BASH_SOURCE[0]}:$LINENO: ${FUNCNAME[0]}: '
1704
1705 export PATH=${CEPH_BUILD_VIRTUALENV}/ceph-disk-virtualenv/bin:${CEPH_BUILD_VIRTUALENV}/ceph-detect-init-virtualenv/bin:.:$PATH # make sure program from sources are preferred
1706 #export PATH=$CEPH_ROOT/src/ceph-disk/virtualenv/bin:$CEPH_ROOT/src/ceph-detect-init/virtualenv/bin:.:$PATH # make sure program from sources are preferred
1707
1708 export CEPH_MON="127.0.0.1:7109" # git grep '\<7109\>' : there must be only one
1709 export CEPH_ARGS
1710 CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
1711 CEPH_ARGS+="--mon-host=$CEPH_MON "
1712 export CEPH_CONF=/dev/null
1713
1714 local funcs=${@:-$(set | sed -n -e 's/^\(test_[0-9a-z_]*\) .*/\1/p')}
1715 local dir=td/ceph-helpers
1716
1717 for func in $funcs ; do
1718 $func $dir || return 1
1719 done
1720}
1721
1722if test "$1" = TESTS ; then
1723 shift
1724 run_tests "$@"
1725fi
1726
1727# Local Variables:
1728# compile-command: "cd ../../src ; make -j4 && ../qa/workunits/ceph-helpers.sh TESTS # test_get_config"
1729# End: