#!/bin/bash -ex

SCRIPT_NAME=$(basename ${BASH_SOURCE[0]})
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

# cleanup during exit
[ -z "$CLEANUP" ] && CLEANUP=true

FSID='00000000-0000-0000-0000-0000deadbeef'

# images that are used
IMAGE_MAIN=${IMAGE_MAIN:-'quay.ceph.io/ceph-ci/ceph:main'}
IMAGE_PACIFIC=${IMAGE_PACIFIC:-'quay.ceph.io/ceph-ci/ceph:pacific'}
#IMAGE_OCTOPUS=${IMAGE_OCTOPUS:-'quay.ceph.io/ceph-ci/ceph:octopus'}
IMAGE_DEFAULT=${IMAGE_MAIN}

OSD_IMAGE_NAME="${SCRIPT_NAME%.*}_osd.img"
OSD_IMAGE_SIZE='6G'
OSD_TO_CREATE=2
OSD_VG_NAME=${SCRIPT_NAME%.*}
OSD_LV_NAME=${SCRIPT_NAME%.*}

# TMPDIR for test data
[ -d "$TMPDIR" ] || TMPDIR=$(mktemp -d tmp.$SCRIPT_NAME.XXXXXX)
[ -d "$TMPDIR_TEST_MULTIPLE_MOUNTS" ] || TMPDIR_TEST_MULTIPLE_MOUNTS=$(mktemp -d tmp.$SCRIPT_NAME.XXXXXX)

CEPHADM_SRC_DIR=${SCRIPT_DIR}/../../../src/cephadm
CEPHADM_SAMPLES_DIR=${CEPHADM_SRC_DIR}/samples

[ -z "$SUDO" ] && SUDO=sudo

# If cephadm is already installed on the system, use that one; avoid building
# one if we can.
if [ -z "$CEPHADM" ] && command -v cephadm >/dev/null ; then
    CEPHADM="$(command -v cephadm)"
fi

if [ -z "$CEPHADM" ]; then
    CEPHADM=`mktemp -p $TMPDIR tmp.cephadm.XXXXXX`
    ${CEPHADM_SRC_DIR}/build.sh "$CEPHADM"
    NO_BUILD_INFO=1
fi

# at this point, we need $CEPHADM set
if ! [ -x "$CEPHADM" ]; then
    echo "cephadm not found. Please set \$CEPHADM"
    exit 1
fi

# add image to args
CEPHADM_ARGS="$CEPHADM_ARGS --image $IMAGE_DEFAULT"

# combine into a single var
CEPHADM_BIN="$CEPHADM"
CEPHADM="$SUDO $CEPHADM_BIN $CEPHADM_ARGS"

# clean up previous run(s)?
$CEPHADM rm-cluster --fsid $FSID --force
$SUDO vgchange -an $OSD_VG_NAME || true
loopdev=$($SUDO losetup -a | grep $(basename $OSD_IMAGE_NAME) | awk -F : '{print $1}')
if ! [ "$loopdev" = "" ]; then
    $SUDO losetup -d $loopdev
fi

function cleanup()
{
    if [ $CLEANUP = false ]; then
        # preserve the TMPDIR state
        echo "========================"
        echo "!!! CLEANUP=$CLEANUP !!!"
        echo
        echo "TMPDIR=$TMPDIR"
        echo "========================"
        return
    fi

    dump_all_logs $FSID
    rm -rf $TMPDIR
}
trap cleanup EXIT

function expect_false()
{
    set -x
    if eval "$@"; then return 1; else return 0; fi
}
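
# Example (illustrative, not executed): the command string is eval'd, so a
# whole pipeline can be asserted to fail, e.g.
#   expect_false "echo up | grep down"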

# expect_return_code $expected_code $command ...
function expect_return_code()
{
    set -x
    local expected_code="$1"
    shift
    local command="$@"

    set +e
    eval "$command"
    local return_code="$?"
    set -e

    if [ ! "$return_code" -eq "$expected_code" ]; then return 1; else return 0; fi
}
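
# Example (illustrative, not executed): 'systemctl status' exits with code 3
# for a unit that is loaded but not running (LSB convention), e.g.
#   expect_return_code 3 systemctl status <some stopped unit>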

function is_available()
{
    local name="$1"
    local condition="$2"
    local tries="$3"

    local num=0
    while ! eval "$condition"; do
        num=$(($num + 1))
        if [ "$num" -ge $tries ]; then
            echo "$name is not available"
            false
        fi
        sleep 5
    done

    echo "$name is available"
    true
}
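
# Example (illustrative, not executed): poll a condition every 5 seconds, up
# to N tries; on timeout the 'false' above aborts the test via 'set -e', e.g.
#   is_available "prometheus" "curl -s localhost:9095" 10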

function dump_log()
{
    local fsid="$1"
    local name="$2"
    local num_lines="$3"

    if [ -z "$num_lines" ]; then
        num_lines=100
    fi

    echo '-------------------------'
    echo 'dump daemon log:' $name
    echo '-------------------------'

    $CEPHADM logs --fsid $fsid --name $name -- --no-pager -n $num_lines
}
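
# Note: in 'cephadm logs', everything after '--' is passed through to the
# underlying journalctl call, which is why journalctl flags such as
# --no-pager and -n work above.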

function dump_all_logs()
{
    local fsid="$1"
    local names=$($CEPHADM ls | jq -r '.[] | select(.fsid == "'$fsid'").name')

    echo 'dumping logs for daemons: ' $names
    for name in $names; do
        dump_log $fsid $name
    done
}

function nfs_stop()
{
    # stop the running nfs server
    local units="nfs-server nfs-kernel-server"
    for unit in $units; do
        if systemctl --no-pager status $unit > /dev/null; then
            $SUDO systemctl stop $unit
        fi
    done

    # ensure the NFS port is no longer in use
    expect_false "$SUDO ss -tlnp '( sport = :nfs )' | grep LISTEN"
}
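
# ('( sport = :nfs )' is ss filter syntax for the NFS port, 2049 per
# /etc/services; the LISTEN grep in the check confirms nothing is still
# bound to it.)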

## prepare + check host
$SUDO $CEPHADM check-host

## run a gather-facts (output to stdout)
$SUDO $CEPHADM gather-facts

## NOTE: as of around May 2023, `cephadm version` no longer bases its output
## on the version of the containers. The version reported is that of the
## cephadm "binary" and is determined during the ceph build.
## `cephadm version` should NOT require sudo/root.
$CEPHADM_BIN version
$CEPHADM_BIN version | grep 'cephadm version'
# Typically cmake should be running the cephadm build script with CLI arguments
# that embed version info into the "binary". If not using a cephadm build via
# cmake you can set `NO_BUILD_INFO` to skip this check.
if [ -z "$NO_BUILD_INFO" ]; then
    $CEPHADM_BIN version | grep -v 'UNSET'
    $CEPHADM_BIN version | grep -v 'UNKNOWN'
fi


## test shell before bootstrap, when crash dir isn't (yet) present on this host
$CEPHADM shell --fsid $FSID -- ceph -v | grep 'ceph version'
$CEPHADM shell --fsid $FSID -e FOO=BAR -- printenv | grep FOO=BAR

# test stdin
echo foo | $CEPHADM shell -- cat | grep -q foo

# The shell commands above cause the /var/lib/ceph/<fsid> directory to be
# created. Since bootstrap now checks that no cluster with the same fsid
# exists (based on that directory existing), make sure the directory is gone
# before bootstrapping. Another rm-cluster accomplishes this.
$CEPHADM rm-cluster --fsid $FSID --force

## bootstrap
ORIG_CONFIG=`mktemp -p $TMPDIR`
CONFIG=`mktemp -p $TMPDIR`
MONCONFIG=`mktemp -p $TMPDIR`
KEYRING=`mktemp -p $TMPDIR`
IP=127.0.0.1
cat <<EOF > $ORIG_CONFIG
[global]
        log to file = true
        osd crush chooseleaf type = 0
EOF
$CEPHADM bootstrap \
    --mon-id a \
    --mgr-id x \
    --mon-ip $IP \
    --fsid $FSID \
    --config $ORIG_CONFIG \
    --output-config $CONFIG \
    --output-keyring $KEYRING \
    --output-pub-ssh-key $TMPDIR/ceph.pub \
    --allow-overwrite \
    --skip-mon-network \
    --skip-monitoring-stack
test -e $CONFIG
test -e $KEYRING
rm -f $ORIG_CONFIG

$SUDO test -e /var/log/ceph/$FSID/ceph-mon.a.log
$SUDO test -e /var/log/ceph/$FSID/ceph-mgr.x.log

for u in ceph.target \
         ceph-$FSID.target \
         ceph-$FSID@mon.a \
         ceph-$FSID@mgr.x; do
    systemctl is-enabled $u
    systemctl is-active $u
done
systemctl | grep system-ceph | grep -q .slice  # naming is escaped and annoying
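# (systemd escapes dashes in unit names as '\x2d', so the slice appears as
# something like 'system-ceph\x2d<fsid>.slice', hence the loose grep above.)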

# check ceph -s works (via shell w/ passed config/keyring)
$CEPHADM shell --fsid $FSID --config $CONFIG --keyring $KEYRING -- \
    ceph -s | grep $FSID

for t in mon mgr node-exporter prometheus grafana; do
    $CEPHADM shell --fsid $FSID --config $CONFIG --keyring $KEYRING -- \
        ceph orch apply $t --unmanaged
done

## ls
$CEPHADM ls | jq '.[]' | jq 'select(.name == "mon.a").fsid' \
    | grep $FSID
$CEPHADM ls | jq '.[]' | jq 'select(.name == "mgr.x").fsid' \
    | grep $FSID

# make sure the version is returned correctly
$CEPHADM ls | jq '.[]' | jq 'select(.name == "mon.a").version' | grep -q \\.

## deploy
# add mon.b
cp $CONFIG $MONCONFIG
echo "public addrv = [v2:$IP:3301,v1:$IP:6790]" >> $MONCONFIG
jq --null-input \
    --arg fsid $FSID \
    --arg name mon.b \
    --arg keyring /var/lib/ceph/$FSID/mon.a/keyring \
    --arg config "$MONCONFIG" \
    '{"fsid": $fsid, "name": $name, "params":{"keyring": $keyring, "config": $config}}' | \
  $CEPHADM _orch deploy
for u in ceph-$FSID@mon.b; do
    systemctl is-enabled $u
    systemctl is-active $u
done
cond="$CEPHADM shell --fsid $FSID --config $CONFIG --keyring $KEYRING -- \
    ceph mon stat | grep '2 mons'"
is_available "mon.b" "$cond" 30
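
# The hidden '_orch deploy' subcommand reads a JSON spec on stdin. The shape
# used throughout this test (sketch, fields taken from the calls here) is:
#   {"fsid": ..., "name": ..., "image": ...,       # image is optional
#    "params": {"keyring": ..., "config": ..., "tcp_ports": ...},
#    "config_blobs": {...}}                        # daemon-specific payload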

# add mgr.y
$CEPHADM shell --fsid $FSID --config $CONFIG --keyring $KEYRING -- \
    ceph auth get-or-create mgr.y \
        mon 'allow profile mgr' \
        osd 'allow *' \
        mds 'allow *' > $TMPDIR/keyring.mgr.y
jq --null-input \
    --arg fsid $FSID \
    --arg name mgr.y \
    --arg keyring $TMPDIR/keyring.mgr.y \
    --arg config "$CONFIG" \
    '{"fsid": $fsid, "name": $name, "params":{"keyring": $keyring, "config": $config}}' | \
  $CEPHADM _orch deploy
for u in ceph-$FSID@mgr.y; do
    systemctl is-enabled $u
    systemctl is-active $u
done

for f in `seq 1 30`; do
    if $CEPHADM shell --fsid $FSID \
            --config $CONFIG --keyring $KEYRING -- \
            ceph -s -f json-pretty \
            | jq '.mgrmap.num_standbys' | grep -q 1 ; then break; fi
    sleep 1
done
$CEPHADM shell --fsid $FSID --config $CONFIG --keyring $KEYRING -- \
    ceph -s -f json-pretty \
    | jq '.mgrmap.num_standbys' | grep -q 1

# add osd.{1,2,..}
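# ('dd' with count=0 and a seek of $OSD_IMAGE_SIZE below creates a sparse
# backing file; 'truncate -s $OSD_IMAGE_SIZE' would be an equivalent way.)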
dd if=/dev/zero of=$TMPDIR/$OSD_IMAGE_NAME bs=1 count=0 seek=$OSD_IMAGE_SIZE
loop_dev=$($SUDO losetup -f)
$SUDO vgremove -f $OSD_VG_NAME || true
$SUDO losetup $loop_dev $TMPDIR/$OSD_IMAGE_NAME
$SUDO pvcreate $loop_dev && $SUDO vgcreate $OSD_VG_NAME $loop_dev

# osd bootstrap keyring
$CEPHADM shell --fsid $FSID --config $CONFIG --keyring $KEYRING -- \
    ceph auth get client.bootstrap-osd > $TMPDIR/keyring.bootstrap.osd

# create lvs first so ceph-volume doesn't overlap with lv creation
for id in `seq 0 $((OSD_TO_CREATE - 1))`; do
    $SUDO lvcreate -l $((100/$OSD_TO_CREATE))%VG -n $OSD_LV_NAME.$id $OSD_VG_NAME
done

for id in `seq 0 $((OSD_TO_CREATE - 1))`; do
    device_name=/dev/$OSD_VG_NAME/$OSD_LV_NAME.$id
    CEPH_VOLUME="$CEPHADM ceph-volume \
        --fsid $FSID \
        --config $CONFIG \
        --keyring $TMPDIR/keyring.bootstrap.osd --"

    # prepare the osd
    $CEPH_VOLUME lvm prepare --bluestore --data $device_name --no-systemd
    $CEPH_VOLUME lvm batch --no-auto $device_name --yes --no-systemd

    # osd id and osd fsid
    $CEPH_VOLUME lvm list --format json $device_name > $TMPDIR/osd.map
    osd_id=$($SUDO cat $TMPDIR/osd.map | jq -cr '.. | ."ceph.osd_id"? | select(.)')
    osd_fsid=$($SUDO cat $TMPDIR/osd.map | jq -cr '.. | ."ceph.osd_fsid"? | select(.)')

    # deploy the osd
    jq --null-input \
        --arg fsid $FSID \
        --arg name osd.$osd_id \
        --arg keyring $TMPDIR/keyring.bootstrap.osd \
        --arg config "$CONFIG" \
        --arg osd_fsid $osd_fsid \
        '{"fsid": $fsid, "name": $name, "params":{"keyring": $keyring, "config": $config, "osd_fsid": $osd_fsid}}' | \
      $CEPHADM _orch deploy
done

# add node-exporter
jq --null-input \
    --arg fsid $FSID \
    --arg name node-exporter.a \
    '{"fsid": $fsid, "name": $name}' | \
  ${CEPHADM//--image $IMAGE_DEFAULT/} _orch deploy
cond="curl 'http://localhost:9100' | grep -q 'Node Exporter'"
is_available "node-exporter" "$cond" 10
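
# Note: ${CEPHADM//--image $IMAGE_DEFAULT/} is bash pattern substitution that
# strips the default --image argument, so cephadm falls back to its built-in
# default image for the monitoring daemon being deployed. Sketch:
#   CEPHADM='sudo cephadm --image example:latest'
#   echo "${CEPHADM//--image example:latest/}"   # -> 'sudo cephadm '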

# add prometheus
jq --null-input \
    --arg fsid $FSID \
    --arg name prometheus.a \
    --argjson config_blobs "$(cat ${CEPHADM_SAMPLES_DIR}/prometheus.json)" \
    '{"fsid": $fsid, "name": $name, "config_blobs": $config_blobs}' | \
  ${CEPHADM//--image $IMAGE_DEFAULT/} _orch deploy
cond="curl 'localhost:9095/api/v1/query?query=up'"
is_available "prometheus" "$cond" 10

# add grafana
jq --null-input \
    --arg fsid $FSID \
    --arg name grafana.a \
    --argjson config_blobs "$(cat ${CEPHADM_SAMPLES_DIR}/grafana.json)" \
    '{"fsid": $fsid, "name": $name, "config_blobs": $config_blobs}' | \
  ${CEPHADM//--image $IMAGE_DEFAULT/} _orch deploy
cond="curl --insecure 'https://localhost:3000' | grep -q 'grafana'"
is_available "grafana" "$cond" 50

# add nfs-ganesha
nfs_stop
nfs_rados_pool=$(cat ${CEPHADM_SAMPLES_DIR}/nfs.json | jq -r '.["pool"]')
$CEPHADM shell --fsid $FSID --config $CONFIG --keyring $KEYRING -- \
    ceph osd pool create $nfs_rados_pool 64
$CEPHADM shell --fsid $FSID --config $CONFIG --keyring $KEYRING -- \
    rados --pool nfs-ganesha --namespace nfs-ns create conf-nfs.a
$CEPHADM shell --fsid $FSID --config $CONFIG --keyring $KEYRING -- \
    ceph orch pause
jq --null-input \
    --arg fsid $FSID \
    --arg name nfs.a \
    --arg keyring "$KEYRING" \
    --arg config "$CONFIG" \
    --argjson config_blobs "$(cat ${CEPHADM_SAMPLES_DIR}/nfs.json)" \
    '{"fsid": $fsid, "name": $name, "params": {"keyring": $keyring, "config": $config}, "config_blobs": $config_blobs}' | \
  ${CEPHADM} _orch deploy
cond="$SUDO ss -tlnp '( sport = :nfs )' | grep 'ganesha.nfsd'"
is_available "nfs" "$cond" 10
$CEPHADM shell --fsid $FSID --config $CONFIG --keyring $KEYRING -- \
    ceph orch resume
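
# (The orchestrator is paused above, presumably so the mgr doesn't reconcile
# away the manually deployed nfs.a daemon, and resumed once it is up.)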

# add alertmanager via custom container
alertmanager_image=$(cat ${CEPHADM_SAMPLES_DIR}/custom_container.json | jq -r '.image')
tcp_ports=$(jq .ports ${CEPHADM_SAMPLES_DIR}/custom_container.json)
jq --null-input \
    --arg fsid $FSID \
    --arg name container.alertmanager.a \
    --arg keyring $TMPDIR/keyring.bootstrap.osd \
    --arg config "$CONFIG" \
    --arg image "$alertmanager_image" \
    --argjson tcp_ports "${tcp_ports}" \
    --argjson config_blobs "$(cat ${CEPHADM_SAMPLES_DIR}/custom_container.json)" \
    '{"fsid": $fsid, "name": $name, "image": $image, "params": {"keyring": $keyring, "config": $config, "tcp_ports": $tcp_ports}, "config_blobs": $config_blobs}' | \
  ${CEPHADM//--image $IMAGE_DEFAULT/} _orch deploy
cond="$CEPHADM enter --fsid $FSID --name container.alertmanager.a -- test -f \
    /etc/alertmanager/alertmanager.yml"
is_available "alertmanager.yml" "$cond" 10
cond="curl 'http://localhost:9093' | grep -q 'Alertmanager'"
is_available "alertmanager" "$cond" 10

## run
# WRITE ME

## unit
$CEPHADM unit --fsid $FSID --name mon.a -- is-enabled
$CEPHADM unit --fsid $FSID --name mon.a -- is-active
expect_false $CEPHADM unit --fsid $FSID --name mon.xyz -- is-active
$CEPHADM unit --fsid $FSID --name mon.a -- disable
expect_false $CEPHADM unit --fsid $FSID --name mon.a -- is-enabled
$CEPHADM unit --fsid $FSID --name mon.a -- enable
$CEPHADM unit --fsid $FSID --name mon.a -- is-enabled
$CEPHADM unit --fsid $FSID --name mon.a -- status
$CEPHADM unit --fsid $FSID --name mon.a -- stop
expect_return_code 3 $CEPHADM unit --fsid $FSID --name mon.a -- status
$CEPHADM unit --fsid $FSID --name mon.a -- start
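
# Note: 'cephadm unit' applies the command after '--' to the daemon's systemd
# unit, so 'status' on the stopped mon returns systemctl's exit code 3
# ("unit not active"), which is what expect_return_code asserts above.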

## shell
$CEPHADM shell --fsid $FSID -- true
$CEPHADM shell --fsid $FSID -- test -d /var/log/ceph
expect_false $CEPHADM --timeout 10 shell --fsid $FSID -- sleep 60
$CEPHADM --timeout 60 shell --fsid $FSID -- sleep 10
$CEPHADM shell --fsid $FSID --mount $TMPDIR $TMPDIR_TEST_MULTIPLE_MOUNTS -- stat /mnt/$(basename $TMPDIR)
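# (Directories passed to --mount show up inside the container under
# /mnt/<basename>, hence the stat path; two mounts are given to exercise
# multiple-mount handling.)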

## enter
expect_false $CEPHADM enter
$CEPHADM enter --fsid $FSID --name mon.a -- test -d /var/lib/ceph/mon/ceph-a
$CEPHADM enter --fsid $FSID --name mgr.x -- test -d /var/lib/ceph/mgr/ceph-x
$CEPHADM enter --fsid $FSID --name mon.a -- pidof ceph-mon
expect_false $CEPHADM enter --fsid $FSID --name mgr.x -- pidof ceph-mon
$CEPHADM enter --fsid $FSID --name mgr.x -- pidof ceph-mgr
# this triggers a bug in older versions of podman, including 18.04's 1.6.2
#expect_false $CEPHADM --timeout 5 enter --fsid $FSID --name mon.a -- sleep 30
$CEPHADM --timeout 60 enter --fsid $FSID --name mon.a -- sleep 10

## ceph-volume
$CEPHADM ceph-volume --fsid $FSID -- inventory --format=json \
    | jq '.[]'

## preserve test state
[ $CLEANUP = false ] && exit 0

## rm-daemon
# mon and osd require --force
expect_false $CEPHADM rm-daemon --fsid $FSID --name mon.a
# mgr does not
$CEPHADM rm-daemon --fsid $FSID --name mgr.x

expect_false $CEPHADM zap-osds --fsid $FSID
$CEPHADM zap-osds --fsid $FSID --force

## rm-cluster
expect_false $CEPHADM rm-cluster --fsid $FSID --zap-osds
$CEPHADM rm-cluster --fsid $FSID --force --zap-osds

echo PASS