#!/bin/sh

set -x

# run on a single-node three-OSD cluster

sudo killall -ABRT ceph-osd
sleep 5

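# killing with SIGABRT has two effects: the kernel writes a coredump for each
# ceph-osd (cleaned up below), and each daemon's crash handler writes a crash
# report under /var/lib/ceph/crash/ for ceph-crash to pick up later in this test
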
# the kill above caused coredumps; find them and delete them, carefully, so
# as not to disturb any other coredumps (teuthology would see those and
# assume the test failed).  sudo is needed because the core files are owned
# by root with mode 0600
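# (for one of our cores the gdb summary normally includes a line like
#  "Core was generated by `/usr/bin/ceph-osd ...'" plus either
#  "Program terminated with signal 6" or, with newer gdb,
#  "Program terminated with signal SIGABRT", which is what the
#  expr matches below test for)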
for f in $(find "$TESTDIR/archive/coredump" -type f); do
    gdb_output=$(echo "quit" | sudo gdb /usr/bin/ceph-osd "$f")
    if expr match "$gdb_output" ".*generated.*ceph-osd.*" && \
       ( \
         expr match "$gdb_output" ".*terminated.*signal 6.*" || \
         expr match "$gdb_output" ".*terminated.*signal SIGABRT.*" \
       )
    then
        sudo rm "$f"
    fi
done

# let daemon find crashdumps on startup
sudo systemctl restart ceph-crash
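# on startup ceph-crash scans /var/lib/ceph/crash/ for reports that have not
# been posted yet, registers each one with the cluster (effectively running
# `ceph crash post`) and then moves it into /var/lib/ceph/crash/posted/;
# the sleep below gives it time to finish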
sleep 30

# all 3 crashdumps must be registered and moved to crash/posted
[ $(ceph crash ls | wc -l) = 4 ] || exit 1    # 4 because of the table header
[ $(sudo find /var/lib/ceph/crash/posted/ -name meta | wc -l) = 3 ] || exit 1
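# (`ceph crash ls` prints a header row plus one row per crash, hence 4 lines
#  for 3 crashes; each posted crash directory should contain a "meta" file
#  with the JSON crash metadata, which is what the find above counts)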

# there should be a health warning
ceph health detail | grep RECENT_CRASH || exit 1
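# RECENT_CRASH is raised by the mgr crash module while there are new
# (unarchived) crashes from the recent past (two weeks by default, via the
# module's warn_recent_interval option); archiving them should clear the
# warning once health is refreshed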
ceph crash archive-all
sleep 30
ceph health detail | grep -c RECENT_CRASH | grep 0    # should be gone!
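# note: `grep -c` prints the number of matching lines even when it is 0, so
# the trailing `grep 0` succeeds only when the warning is gone; since this
# pipeline is the last command, its status is also the script's exit status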