]> git.proxmox.com Git - ceph.git/blobdiff - ceph/src/spdk/scripts/setup.sh
update sources to ceph Nautilus 14.2.1
[ceph.git] / ceph / src / spdk / scripts / setup.sh
index e80e237732eb7f49350a3c7d82ab819f32d8294d..22e726de52101bc56e0fa366716ee8eff9401e66 100755 (executable)
@@ -3,12 +3,85 @@
 set -e
 
 rootdir=$(readlink -f $(dirname $0))/..
+source "$rootdir/scripts/common.sh"
+
+function usage()
+{
+       if [ `uname` = Linux ]; then
+               options="[config|reset|status|cleanup|help]"
+       else
+               options="[config|reset|help]"
+       fi
+
+       [[ ! -z $2 ]] && ( echo "$2"; echo ""; )
+       echo "Helper script for allocating hugepages and binding NVMe, I/OAT and Virtio devices to"
+       echo "a generic VFIO kernel driver. If VFIO is not available on the system, this script will"
+       echo "fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
+       echo "All hugepage operations use default hugepage size on the system (hugepagesz)."
+       echo "Usage: $(basename $1) $options"
+       echo
+       echo "$options - as following:"
+       echo "config            Default mode. Allocate hugepages and bind PCI devices."
+       if [ `uname` = Linux ]; then
+               echo "cleanup            Remove any orphaned files that can be left in the system after SPDK application exit"
+       fi
+       echo "reset             Rebind PCI devices back to their original drivers."
+       echo "                  Also cleanup any leftover spdk files/resources."
+       echo "                  Hugepage memory size will remain unchanged."
+       if [ `uname` = Linux ]; then
+               echo "status            Print status of all SPDK-compatible devices on the system."
+       fi
+       echo "help              Print this help message."
+       echo
+       echo "The following environment variables can be specified."
+       echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
+       echo "                  For NUMA systems, the hugepages will be evenly distributed"
+       echo "                  between CPU nodes"
+       echo "NRHUGE            Number of hugepages to allocate. This variable overwrites HUGEMEM."
+       echo "HUGENODE          Specific NUMA node to allocate hugepages on. To allocate"
+       echo "                  hugepages on multiple nodes run this script multiple times -"
+       echo "                  once for each node."
+       echo "PCI_WHITELIST     Whitespace separated list of PCI devices (NVMe, I/OAT, Virtio) to bind."
+       echo "                  Each device must be specified as a full PCI address."
+       echo "                  E.g. PCI_WHITELIST=\"0000:01:00.0 0000:02:00.0\""
+       echo "                  To blacklist all PCI devices use a non-valid address."
+       echo "                  E.g. PCI_WHITELIST=\"none\""
+       echo "                  If empty or unset, all PCI devices will be bound."
+       echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
+       echo "                  By default the current user will be used."
+       exit 0
+}
+
+# In monolithic kernels the lsmod won't work. So
+# back that with a /sys/modules check. Return a different code for
+# built-in vs module just in case we want that down the road.
+function check_for_driver {
+       $(lsmod | grep $1 > /dev/null)
+       if [ $? -eq 0 ]; then
+               return 1
+       else
+               if [[ -d /sys/module/$1 ]]; then
+                       return 2
+               else
+                       return 0
+               fi
+       fi
+       return 0
+}
+
+function pci_can_bind() {
+       if [[ ${#PCI_WHITELIST[@]} == 0 ]]; then
+               #no whitelist specified, bind all devices
+               return 1
+       fi
 
-function linux_iter_pci {
-       # Argument is the class code
-       # TODO: More specifically match against only class codes in the grep
-       # step.
-       lspci -mm -n | grep $1 | tr -d '"' | awk -F " " '{print "0000:"$1}'
+       for i in ${PCI_WHITELIST[@]}
+       do
+               if [ "$i" == "$1" ] ; then
+                        return 1
+               fi
+       done
+       return 0
 }
 
 function linux_bind_driver() {
@@ -35,17 +108,62 @@ function linux_bind_driver() {
 
        iommu_group=$(basename $(readlink -f /sys/bus/pci/devices/$bdf/iommu_group))
        if [ -e "/dev/vfio/$iommu_group" ]; then
-               if [ "$username" != "" ]; then
-                       chown "$username" "/dev/vfio/$iommu_group"
+               if [ -n "$TARGET_USER" ]; then
+                       chown "$TARGET_USER" "/dev/vfio/$iommu_group"
                fi
        fi
 }
 
-function linux_hugetlbfs_mount() {
-       mount | grep '^hugetlbfs ' | awk '{ print $3 }'
+function linux_unbind_driver() {
+       bdf="$1"
+       ven_dev_id=$(lspci -n -s $bdf | cut -d' ' -f3 | sed 's/:/ /')
+
+       if ! [ -e "/sys/bus/pci/devices/$bdf/driver" ]; then
+               return 0
+       fi
+
+       old_driver_name=$(basename $(readlink /sys/bus/pci/devices/$bdf/driver))
+
+       echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true
+       echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
+       echo "$bdf ($ven_dev_id): $old_driver_name -> no driver"
+}
+
+function linux_hugetlbfs_mounts() {
+       mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
 }
 
-function configure_linux {
+function get_nvme_name_from_bdf {
+       set +e
+       nvme_devs=`lsblk -d --output NAME | grep "^nvme"`
+       set -e
+       for dev in $nvme_devs; do
+               link_name=$(readlink /sys/block/$dev/device/device) || true
+               if [ -z "$link_name" ]; then
+                       link_name=$(readlink /sys/block/$dev/device)
+               fi
+               link_bdf=$(basename "$link_name")
+               if [ "$link_bdf" = "$1" ]; then
+                       eval "$2=$dev"
+                       return
+               fi
+       done
+}
+
+function get_virtio_names_from_bdf {
+       blk_devs=`lsblk --nodeps --output NAME`
+       virtio_names=''
+
+       for dev in $blk_devs; do
+               if readlink "/sys/block/$dev" | grep -q "$1"; then
+                       virtio_names="$virtio_names $dev"
+               fi
+       done
+
+       eval "$2='$virtio_names'"
+}
+
+function configure_linux_pci {
        driver_name=vfio-pci
        if [ -z "$(ls /sys/kernel/iommu_groups)" ]; then
                # No IOMMU. Use uio.
@@ -54,11 +172,25 @@ function configure_linux {
 
        # NVMe
        modprobe $driver_name || true
-       for bdf in $(linux_iter_pci 0108); do
-               linux_bind_driver "$bdf" "$driver_name"
+       for bdf in $(iter_pci_class_code 01 08 02); do
+               blkname=''
+               get_nvme_name_from_bdf "$bdf" blkname
+               if pci_can_bind $bdf == "0" ; then
+                       echo "Skipping un-whitelisted NVMe controller $blkname ($bdf)"
+                       continue
+               fi
+               if [ "$blkname" != "" ]; then
+                       mountpoints=$(lsblk /dev/$blkname --output MOUNTPOINT -n | wc -w)
+               else
+                       mountpoints="0"
+               fi
+               if [ "$mountpoints" = "0" ]; then
+                       linux_bind_driver "$bdf" "$driver_name"
+               else
+                       echo Active mountpoints on /dev/$blkname, so not binding PCI dev $bdf
+               fi
        done
 
-
        # IOAT
        TMP=`mktemp`
        #collect all the device_id info of ioat devices.
@@ -66,28 +198,132 @@ function configure_linux {
        | awk -F"x" '{print $2}' > $TMP
 
        for dev_id in `cat $TMP`; do
-               # Abuse linux_iter_pci by giving it a device ID instead of a class code
-               for bdf in $(linux_iter_pci $dev_id); do
+               for bdf in $(iter_pci_dev_id 8086 $dev_id); do
+                       if pci_can_bind $bdf == "0" ; then
+                               echo "Skipping un-whitelisted I/OAT device at $bdf"
+                               continue
+                       fi
+                       linux_bind_driver "$bdf" "$driver_name"
+               done
+       done
+       rm $TMP
+
+       # virtio
+       TMP=`mktemp`
+       #collect all the device_id info of virtio devices.
+       grep "PCI_DEVICE_ID_VIRTIO" $rootdir/include/spdk/pci_ids.h \
+       | awk -F"x" '{print $2}' > $TMP
+
+       for dev_id in `cat $TMP`; do
+               for bdf in $(iter_pci_dev_id 1af4 $dev_id); do
+                       if pci_can_bind $bdf == "0" ; then
+                               echo "Skipping un-whitelisted Virtio device at $bdf"
+                               continue
+                       fi
+                       blknames=''
+                       get_virtio_names_from_bdf "$bdf" blknames
+                       for blkname in $blknames; do
+                               if mount | grep -q "/dev/$blkname"; then
+                                       echo Active mountpoints on /dev/$blkname, so not binding PCI dev $bdf
+                                       continue 2
+                               fi
+                       done
+
                        linux_bind_driver "$bdf" "$driver_name"
                done
        done
        rm $TMP
 
        echo "1" > "/sys/bus/pci/rescan"
+}
+
+function cleanup_linux {
+       shopt -s extglob nullglob
+       dirs_to_clean=""
+       dirs_to_clean="$(echo {/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9])) "
+       if [[ -d $XDG_RUNTIME_DIR && $XDG_RUNTIME_DIR != *" "* ]]; then
+               dirs_to_clean+="$(readlink -e assert_not_empty $XDG_RUNTIME_DIR/dpdk/spdk{,_pid}+([0-9]) || true) "
+       fi
+
+       files_to_clean=""
+       for dir in $dirs_to_clean; do
+               files_to_clean+="$(echo $dir/*) "
+       done
+       shopt -u extglob nullglob
+
+       files_to_clean+="$(echo /dev/shm/* | egrep '(spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevtest|bdevperf)_trace|spdk_iscsi_conns' || true) "
+       files_to_clean="$(readlink -e assert_not_empty $files_to_clean || true)"
+       if [[ -z "$files_to_clean" ]]; then
+               echo "Clean"
+               return 0;
+       fi
+
+       shopt -s extglob
+       for fd_dir in $(echo /proc/+([0-9])); do
+               opened_files+="$(readlink -e assert_not_empty $fd_dir/fd/* || true)"
+       done
+       shopt -u extglob
+
+       if [[ -z "$opened_files" ]]; then
+               echo "Can't get list of opened files!"
+               exit 1
+       fi
+
+       echo 'Cleaning'
+       for f in $files_to_clean; do
+               if ! echo "$opened_files" | egrep -q "^$f\$"; then
+                       echo "Removing:    $f"
+                       rm $f
+               else
+                       echo "Still open: $f"
+               fi
+       done
+
+       for dir in $dirs_to_clean; do
+       if ! echo "$opened_files" | egrep -q "^$dir\$"; then
+               echo "Removing:    $dir"
+               rmdir $dir
+       else
+               echo "Still open: $dir"
+       fi
+       done
+       echo "Clean"
 
-       hugetlbfs_mount=$(linux_hugetlbfs_mount)
+       unset dirs_to_clean files_to_clean opened_files
+}
+
+function configure_linux {
+       configure_linux_pci
+       hugetlbfs_mounts=$(linux_hugetlbfs_mounts)
+
+       if [ -z "$hugetlbfs_mounts" ]; then
+               hugetlbfs_mounts=/mnt/huge
+               echo "Mounting hugetlbfs at $hugetlbfs_mounts"
+               mkdir -p "$hugetlbfs_mounts"
+               mount -t hugetlbfs nodev "$hugetlbfs_mounts"
+       fi
 
-       if [ -z "$hugetlbfs_mount" ]; then
-               hugetlbfs_mount=/mnt/huge
-               echo "Mounting hugetlbfs at $hugetlbfs_mount"
-               mkdir -p "$hugetlbfs_mount"
-               mount -t hugetlbfs nodev "$hugetlbfs_mount"
+       if [ -z "$HUGENODE" ]; then
+               hugepages_target="/proc/sys/vm/nr_hugepages"
+       else
+               hugepages_target="/sys/devices/system/node/node${HUGENODE}/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages"
+       fi
+
+       echo "$NRHUGE" > "$hugepages_target"
+       allocated_hugepages=`cat $hugepages_target`
+       if [ "$allocated_hugepages" -lt "$NRHUGE" ]; then
+               echo ""
+               echo "## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated."
+               echo "## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine."
+               exit 1
        fi
-       echo "$NRHUGE" > /proc/sys/vm/nr_hugepages
 
        if [ "$driver_name" = "vfio-pci" ]; then
-               if [ "$username" != "" ]; then
-                       chown "$username" "$hugetlbfs_mount"
+               if [ -n "$TARGET_USER" ]; then
+                       for mount in $hugetlbfs_mounts; do
+                               chown "$TARGET_USER" "$mount"
+                               chmod g+w "$mount"
+                       done
                fi
 
                MEMLOCK_AMNT=`ulimit -l`
@@ -111,43 +347,119 @@ function configure_linux {
        fi
 }
 
-function reset_linux {
+function reset_linux_pci {
        # NVMe
-       modprobe nvme || true
-       for bdf in $(linux_iter_pci 0108); do
-               linux_bind_driver "$bdf" nvme
+       set +e
+       check_for_driver nvme
+       driver_loaded=$?
+       set -e
+       for bdf in $(iter_pci_class_code 01 08 02); do
+               if pci_can_bind $bdf == "0" ; then
+                       echo "Skipping un-whitelisted NVMe controller $blkname ($bdf)"
+                       continue
+               fi
+               if [ $driver_loaded -ne 0 ]; then
+                       linux_bind_driver "$bdf" nvme
+               else
+                       linux_unbind_driver "$bdf"
+               fi
        done
 
-
        # IOAT
        TMP=`mktemp`
        #collect all the device_id info of ioat devices.
        grep "PCI_DEVICE_ID_INTEL_IOAT" $rootdir/include/spdk/pci_ids.h \
        | awk -F"x" '{print $2}' > $TMP
 
-       modprobe ioatdma || true
+       set +e
+       check_for_driver ioatdma
+       driver_loaded=$?
+       set -e
+       for dev_id in `cat $TMP`; do
+               for bdf in $(iter_pci_dev_id 8086 $dev_id); do
+                       if pci_can_bind $bdf == "0" ; then
+                               echo "Skipping un-whitelisted I/OAT device at $bdf"
+                               continue
+                       fi
+                       if [ $driver_loaded -ne 0 ]; then
+                               linux_bind_driver "$bdf" ioatdma
+                       else
+                               linux_unbind_driver "$bdf"
+                       fi
+               done
+       done
+       rm $TMP
+
+       # virtio
+       TMP=`mktemp`
+       #collect all the device_id info of virtio devices.
+       grep "PCI_DEVICE_ID_VIRTIO" $rootdir/include/spdk/pci_ids.h \
+       | awk -F"x" '{print $2}' > $TMP
+
+       # TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
+       # Requires some more investigation - for example, some kernels do not seem to have
+       #  virtio-pci but just virtio_scsi instead.  Also need to make sure we get the
+       #  underscore vs. dash right in the virtio_scsi name.
+       modprobe virtio-pci || true
        for dev_id in `cat $TMP`; do
-               # Abuse linux_iter_pci by giving it a device ID instead of a class code
-               for bdf in $(linux_iter_pci $dev_id); do
-                       linux_bind_driver "$bdf" ioatdma
+               for bdf in $(iter_pci_dev_id 1af4 $dev_id); do
+                       if pci_can_bind $bdf == "0" ; then
+                               echo "Skipping un-whitelisted Virtio device at $bdf"
+                               continue
+                       fi
+                       linux_bind_driver "$bdf" virtio-pci
                done
        done
        rm $TMP
 
        echo "1" > "/sys/bus/pci/rescan"
+}
 
-       hugetlbfs_mount=$(linux_hugetlbfs_mount)
-       rm -f "$hugetlbfs_mount"/spdk*map_*
+function reset_linux {
+       reset_linux_pci
+       for mount in $(linux_hugetlbfs_mounts); do
+               rm -f "$mount"/spdk*map_*
+       done
+       rm -f /run/.spdk*
 }
 
 function status_linux {
+       echo "Hugepages"
+       printf "%-6s %10s %8s / %6s\n" "node" "hugesize"  "free" "total"
+
+       numa_nodes=0
+       shopt -s nullglob
+       for path in /sys/devices/system/node/node?/hugepages/hugepages-*/; do
+               numa_nodes=$((numa_nodes + 1))
+               free_pages=`cat $path/free_hugepages`
+               all_pages=`cat $path/nr_hugepages`
+
+               [[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]
+
+               node=${BASH_REMATCH[1]}
+               huge_size=${BASH_REMATCH[2]}
+
+               printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
+       done
+       shopt -u nullglob
+
+       # fall back to system-wide hugepages
+       if [ "$numa_nodes" = "0" ]; then
+               free_pages=`grep HugePages_Free /proc/meminfo | awk '{ print $2 }'`
+               all_pages=`grep HugePages_Total /proc/meminfo | awk '{ print $2 }'`
+               node="-"
+               huge_size="$HUGEPGSZ"
+
+               printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
+       fi
+
        echo "NVMe devices"
 
        echo -e "BDF\t\tNuma Node\tDriver name\t\tDevice name"
-       for bdf in $(linux_iter_pci 0108); do
+       for bdf in $(iter_pci_class_code 01 08 02); do
                driver=`grep DRIVER /sys/bus/pci/devices/$bdf/uevent |awk -F"=" '{print $2}'`
                node=`cat /sys/bus/pci/devices/$bdf/numa_node`;
-               if [ "$driver" = "nvme" ]; then
+               if [ "$driver" = "nvme" -a -d /sys/bus/pci/devices/$bdf/nvme ]; then
                        name="\t"`ls /sys/bus/pci/devices/$bdf/nvme`;
                else
                        name="-";
@@ -162,16 +474,31 @@ function status_linux {
        | awk -F"x" '{print $2}'`
        echo -e "BDF\t\tNuma Node\tDriver Name"
        for dev_id in $TMP; do
-               # Abuse linux_iter_pci by giving it a device ID instead of a class code
-               for bdf in $(linux_iter_pci $dev_id); do
+               for bdf in $(iter_pci_dev_id 8086 $dev_id); do
                        driver=`grep DRIVER /sys/bus/pci/devices/$bdf/uevent |awk -F"=" '{print $2}'`
                        node=`cat /sys/bus/pci/devices/$bdf/numa_node`;
                        echo -e "$bdf\t$node\t\t$driver"
                done
        done
+
+       echo "virtio"
+
+       #collect all the device_id info of virtio devices.
+       TMP=`grep "PCI_DEVICE_ID_VIRTIO" $rootdir/include/spdk/pci_ids.h \
+       | awk -F"x" '{print $2}'`
+       echo -e "BDF\t\tNuma Node\tDriver Name\t\tDevice Name"
+       for dev_id in $TMP; do
+               for bdf in $(iter_pci_dev_id 1af4 $dev_id); do
+                       driver=`grep DRIVER /sys/bus/pci/devices/$bdf/uevent |awk -F"=" '{print $2}'`
+                       node=`cat /sys/bus/pci/devices/$bdf/numa_node`;
+                       blknames=''
+                       get_virtio_names_from_bdf "$bdf" blknames
+                       echo -e "$bdf\t$node\t\t$driver\t\t$blknames"
+               done
+       done
 }
 
-function configure_freebsd {
+function configure_freebsd_pci {
        TMP=`mktemp`
 
        # NVMe
@@ -193,11 +520,22 @@ function configure_freebsd {
        kenv hw.nic_uio.bdfs=$BDFS
        kldload nic_uio.ko
        rm $TMP
+}
 
-       kldunload contigmem.ko || true
-       kenv hw.contigmem.num_buffers=$((NRHUGE * 2 / 256))
-       kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
-       kldload contigmem.ko
+function configure_freebsd {
+       configure_freebsd_pci
+       # If contigmem is already loaded but the HUGEMEM specified doesn't match the
+       #  previous value, unload contigmem so that we can reload with the new value.
+       if kldstat -q -m contigmem; then
+               if [ `kenv hw.contigmem.num_buffers` -ne "$((HUGEMEM / 256))" ]; then
+                       kldunload contigmem.ko
+               fi
+       fi
+       if ! kldstat -q -m contigmem; then
+               kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
+               kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
+               kldload contigmem.ko
+       fi
 }
 
 function reset_freebsd {
@@ -205,39 +543,62 @@ function reset_freebsd {
        kldunload nic_uio.ko || true
 }
 
-: ${NRHUGE:=1024}
+mode=$1
+
+if [ -z "$mode" ]; then
+       mode="config"
+fi
 
-username=$1
-mode=$2
+: ${HUGEMEM:=2048}
+: ${PCI_WHITELIST:=""}
 
-if [ "$username" = "reset" -o "$username" = "config" -o "$username" = "status" ]; then
-       mode="$username"
-       username=""
+if [ -n "$NVME_WHITELIST" ]; then
+       PCI_WHITELIST="$PCI_WHITELIST $NVME_WHITELIST"
 fi
 
-if [ "$mode" == "" ]; then
-       mode="config"
+if [ -n "$SKIP_PCI" ]; then
+       PCI_WHITELIST="none"
 fi
 
-if [ "$username" = "" ]; then
-       username="$SUDO_USER"
-       if [ "$username" = "" ]; then
-               username=`logname 2>/dev/null` || true
+declare -a PCI_WHITELIST=(${PCI_WHITELIST})
+
+if [ -z "$TARGET_USER" ]; then
+       TARGET_USER="$SUDO_USER"
+       if [ -z "$TARGET_USER" ]; then
+               TARGET_USER=`logname 2>/dev/null` || true
        fi
 fi
 
 if [ `uname` = Linux ]; then
+       HUGEPGSZ=$(( `grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9'` ))
+       HUGEPGSZ_MB=$(( $HUGEPGSZ / 1024 ))
+       : ${NRHUGE=$(( (HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB ))}
+
        if [ "$mode" == "config" ]; then
                configure_linux
+       elif [ "$mode" == "cleanup" ]; then
+               cleanup_linux
        elif [ "$mode" == "reset" ]; then
                reset_linux
        elif [ "$mode" == "status" ]; then
                status_linux
+       elif [ "$mode" == "help" ]; then
+               usage $0
+       else
+               usage $0 "Invalid argument '$mode'"
        fi
 else
        if [ "$mode" == "config" ]; then
                configure_freebsd
        elif [ "$mode" == "reset" ]; then
                reset_freebsd
+       elif [ "$mode" == "cleanup" ]; then
+               echo "setup.sh cleanup function not yet supported on $(uname)"
+       elif [ "$mode" == "status" ]; then
+               echo "setup.sh status function not yet supported on $(uname)"
+       elif [ "$mode" == "help" ]; then
+               usage $0
+       else
+               usage $0 "Invalid argument '$mode'"
        fi
 fi