]> git.proxmox.com Git - mirror_lxc.git/blob - hooks/nvidia
hooks: change the semantic of NVIDIA_VISIBLE_DEVICES=""
[mirror_lxc.git] / hooks / nvidia
#! /bin/bash

# Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.

set -eu

# NVIDIA_VISIBLE_DEVICES="" *or* NVIDIA_VISIBLE_DEVICES="void"
# GPU support was explicitly disabled, exit early.
# The "-x" fallback only applies when the variable is unset, so an unset
# variable is not mistaken for an explicitly empty one here.
if [ -z "${NVIDIA_VISIBLE_DEVICES-x}" ] || [ "${NVIDIA_VISIBLE_DEVICES:-}" = "void" ]; then
    exit 0
fi

# https://github.com/nvidia/nvidia-container-runtime#cuda_version
if [ -n "${CUDA_VERSION:-}" ] && [ -z "${NVIDIA_REQUIRE_CUDA:-}" ]; then
    # Legacy CUDA image: default to all devices and all driver capabilities.
    if [ -z "${NVIDIA_VISIBLE_DEVICES+x}" ]; then
        NVIDIA_VISIBLE_DEVICES="all"
    fi
    if [ -z "${NVIDIA_DRIVER_CAPABILITIES:-}" ]; then
        NVIDIA_DRIVER_CAPABILITIES="all"
    fi
    if [[ "${CUDA_VERSION}" =~ ^[0-9]+\.[0-9]+ ]]; then
        # Derive a "cuda>=MAJOR.MINOR" constraint from the image's CUDA version.
        # It has to be exported: the NVIDIA_REQUIRE_* constraints are gathered
        # further down with `compgen -e`, which only lists exported variables,
        # so a plain assignment would be silently ignored.
        export NVIDIA_REQUIRE_CUDA="cuda>=${BASH_REMATCH[0]}"
    fi
else
    # NVIDIA_VISIBLE_DEVICES unset and it's not a legacy CUDA image.
    # This is not a GPU image, exit early.
    if [ -z "${NVIDIA_VISIBLE_DEVICES+x}" ]; then
        exit 0
    fi
fi

# Hooks may run with a minimal PATH; append the usual system directories so
# that nvidia-container-cli and ldconfig can be located.
export PATH="${PATH}:/usr/sbin:/usr/bin:/sbin:/bin"
# `command -v` is a shell builtin, unlike `which`, which may not be installed.
if ! command -v nvidia-container-cli >/dev/null; then
    echo "ERROR: Missing tool nvidia-container-cli, see https://github.com/NVIDIA/libnvidia-container" >&2
    exit 1
fi
38
# Report whether the current process runs inside a user namespace.
#
# Arguments (both optional; they default to the live procfs files):
#   $1 - uid map of the current process (default: /proc/self/uid_map)
#   $2 - uid map of init               (default: /proc/1/uid_map)
# Outputs (stdout), exactly one of:
#   no          - the uid map file does not exist, or it contains the
#                 identity mapping "0 0 4294967295"
#   userns-root - some entry maps exactly one uid onto outside uid 0, or
#                 the map is identical to init's map
#   yes         - anything else
# Returns: 0
in_userns() {
    local self_map="${1:-/proc/self/uid_map}"
    local init_map="${2:-/proc/1/uid_map}"
    local inside outside count _rest

    [ -e "${self_map}" ] || { echo no; return; }

    # Each line is "<uid inside> <uid outside> <count>"; `read` does the field
    # splitting, so no awk/grep process is needed per line.
    while read -r inside outside count _rest; do
        if [ "${inside} ${outside} ${count}" = "0 0 4294967295" ]; then
            echo no
            return
        fi
        if [ "${outside}" = "0" ] && [ "${count}" = "1" ]; then
            echo userns-root
            return
        fi
    done < "${self_map}"

    if [ "$(cat "${self_map}")" = "$(cat "${init_map}")" ]; then
        echo userns-root
        return
    fi
    echo yes
}
51
# Locate the ldconfig binary on PATH.
# "ldconfig.real" is tried first and plain "ldconfig" is the fallback.
# Outputs: the path of the first match on stdout.
# Returns: 0 when a binary was found, non-zero otherwise.
get_ldconfig() {
    # `type -P` is a bash builtin that searches PATH and prints only the file
    # name, so this does not depend on an external `which` being installed.
    type -P "ldconfig.real" || type -P "ldconfig"
}
56
# Translate one driver capability name into its nvidia-container-cli flag.
# Arguments: $1 - capability name (compute, compat32, graphics, utility, video)
# Outputs:   the matching "--<name>" flag on stdout
# Returns:   0 for a known capability, 1 otherwise (nothing is printed)
capability_to_cli() {
    case "${1:-}" in
        compute|compat32|graphics|utility|video)
            echo "--${1}"
            ;;
        *)
            # `return`, not `exit`: a direct (non-subshell) call must not
            # terminate the whole script.
            return 1
            ;;
    esac
}
68
# Same behavior as strconv.ParseBool in golang
# Arguments: $1 - textual boolean
# Outputs:   "true" or "false" on stdout
# Returns:   0 when $1 is a recognised spelling, 1 otherwise (nothing printed)
parse_bool() {
    case "${1:-}" in
        1|t|T|TRUE|true|True)
            echo "true"
            ;;
        0|f|F|FALSE|false|False)
            echo "false"
            ;;
        *)
            # `return`, not `exit`: a direct (non-subshell) call must not
            # terminate the whole script.
            return 1
            ;;
    esac
}
78
# Print the command-line help text of this hook on stdout.
# Returns: 0
usage() {
    # One argument per output line; an empty argument yields a blank line.
    printf '%s\n' \
        "nvidia-container-cli hook for LXC" \
        "" \
        "Special arguments:" \
        "[ -h | --help ]: Print this help message and exit." \
        "" \
        "Optional arguments:" \
        "[ --no-load-kmods ]: Do not try to load the NVIDIA kernel modules." \
        "[ --disable-require ]: Disable all the constraints of the form NVIDIA_REQUIRE_*." \
        "[ --debug <path> ]: The path to the log file." \
        "[ --ldconfig <path> ]: The path to the ldconfig binary, use a '@' prefix for a host path."
    return 0
}
94
# Parse the hook's own options. getopt normalises them and appends "--"
# followed by the remaining positional arguments.
# The call is tested directly in the `if`: under `set -e` a bare assignment
# would abort the script before any later `$?` check could print the usage.
if ! options=$(getopt -o h -l help,no-load-kmods,disable-require,debug:,ldconfig: -- "$@"); then
    usage
    exit 1
fi
eval set -- "$options"

CLI_LOAD_KMODS="true"
CLI_DISABLE_REQUIRE="false"
CLI_DEBUG=
CLI_LDCONFIG=

while :; do
    case "$1" in
        # getopt emits the short form as "-h", so both spellings are matched.
        -h|--help) usage; exit 1;;
        --no-load-kmods) CLI_LOAD_KMODS="false"; shift 1;;
        --disable-require) CLI_DISABLE_REQUIRE="true"; shift 1;;
        --debug) CLI_DEBUG=$2; shift 2;;
        --ldconfig) CLI_LDCONFIG=$2; shift 2;;
        --) shift 1; break;;
        *) break;;
    esac
done
118
# Determine the hook section and type. With hook version 0 (also the default
# when LXC_HOOK_VERSION is unset or empty) they are read from the second and
# third positional arguments; with version 1 they come from the environment.
HOOK_SECTION=
HOOK_TYPE=
if [ "${LXC_HOOK_VERSION:-0}" = "0" ]; then
    HOOK_SECTION="${2:-}"
    HOOK_TYPE="${3:-}"
elif [ "${LXC_HOOK_VERSION}" = "1" ]; then
    HOOK_SECTION="${LXC_HOOK_SECTION:-}"
    HOOK_TYPE="${LXC_HOOK_TYPE:-}"
else
    echo "ERROR: Unsupported hook version: ${LXC_HOOK_VERSION}." >&2
    exit 1
fi

# Only the "lxc" section is accepted.
[ "${HOOK_SECTION}" = "lxc" ] || {
    echo "ERROR: Not running through LXC." >&2
    exit 1
}

# Only the "mount" hook type is accepted.
[ "${HOOK_TYPE}" = "mount" ] || {
    echo "ERROR: This hook must be used as a \"mount\" hook." >&2
    exit 1
}
136
# Refuse to run unless in_userns reports "yes"; in that case kernel module
# loading is switched off and a warning is printed if nvidia_uvm is not
# listed in /proc/modules.
USERNS=$(in_userns)
case "${USERNS}" in
    yes)
        CLI_LOAD_KMODS="false"
        grep -q nvidia_uvm /proc/modules ||
            echo "WARN: Kernel module nvidia_uvm is not loaded, nvidia-container-cli might fail. Make sure the NVIDIA device driver is installed and loaded." >&2
        ;;
    *)
        # This is a limitation of libnvidia-container.
        echo "FIXME: This hook currently only works in unprivileged mode." >&2
        exit 1
        ;;
esac
150
# https://github.com/nvidia/nvidia-container-runtime#nvidia_disable_require
# An unset, empty or unparsable value makes parse_bool print nothing, so only
# a value that parses as true switches the requirement checks off.
case "$(parse_bool "${NVIDIA_DISABLE_REQUIRE:-}")" in
    true) CLI_DISABLE_REQUIRE="true" ;;
esac
157
# When no --debug path was given and LXC_LOG_LEVEL is DEBUG or TRACE, log to
# "nvidia.log" inside a "hook" directory derived from LXC_ROOTFS_PATH.
# The ":-" defaults keep `set -u` from aborting the hook when LXC_LOG_LEVEL or
# LXC_ROOTFS_PATH is not present in the environment.
if [ -z "${CLI_DEBUG}" ]; then
    case "${LXC_LOG_LEVEL:-}" in
        DEBUG|TRACE)
            # Drop everything up to and including the first ':'
            # (presumably a storage-backend prefix -- confirm against LXC).
            rootfs_path="${LXC_ROOTFS_PATH:-}"
            rootfs_path="${rootfs_path#*:}"
            # Replace a trailing "rootfs" path component with "hook".
            hookdir="${rootfs_path/%rootfs/hook}"
            # Best effort: if the directory cannot be created, logging stays off.
            if [ -n "${hookdir}" ] && mkdir -p "${hookdir}"; then
                CLI_DEBUG="${hookdir}/nvidia.log"
            fi
            ;;
    esac
fi
167
# A '@' prefix means a host path.
# Without an explicit --ldconfig, fall back to the ldconfig found on the host;
# if none is found, CLI_LDCONFIG stays empty.
if [ -z "${CLI_LDCONFIG}" ] && host_ldconfig=$(get_ldconfig); then
    CLI_LDCONFIG="@${host_ldconfig}"
fi
174
# https://github.com/nvidia/nvidia-container-runtime#nvidia_visible_devices
CLI_DEVICES="${NVIDIA_VISIBLE_DEVICES}"

# https://github.com/nvidia/nvidia-container-runtime#nvidia_require_
# Concatenate the values of all exported NVIDIA_REQUIRE_* variables,
# separated by spaces.
CLI_REQUIREMENTS=
for req in $(compgen -e "NVIDIA_REQUIRE_"); do
    CLI_REQUIREMENTS+=" ${!req}"
done

# https://github.com/nvidia/nvidia-container-runtime#nvidia_driver_capabilities
# Empty or unset selects "utility" only, the single word "all" selects every
# capability, anything else is the comma-separated list turned into a
# space-separated one.
case "${NVIDIA_DRIVER_CAPABILITIES:-}" in
    "")
        CLI_CAPABILITIES="utility"
        ;;
    all)
        CLI_CAPABILITIES="compute compat32 graphics utility video"
        ;;
    *)
        CLI_CAPABILITIES="${NVIDIA_DRIVER_CAPABILITIES//,/ }"
        ;;
esac
197
# Assemble the nvidia-container-cli command line and replace this process
# with it. Arguments are kept in arrays that start out empty and are expanded
# quoted, so a path containing whitespace or glob characters (debug log,
# ldconfig, rootfs) reaches the tool as a single, unmodified argument.
global_args=()
configure_args=()

if [ -n "${CLI_DEBUG}" ]; then
    echo "INFO: Writing nvidia-container-cli log at ${CLI_DEBUG}." >&2
    global_args+=("--debug=${CLI_DEBUG}")
fi

if [ "${CLI_LOAD_KMODS}" = "true" ]; then
    global_args+=(--load-kmods)
fi

if [ "${USERNS}" = "yes" ]; then
    global_args+=(--user)
    configure_args+=(--no-cgroups)
fi

if [ -n "${CLI_LDCONFIG}" ]; then
    configure_args+=(--ldconfig="${CLI_LDCONFIG}")
fi

# An empty device list or "none" means no --device option is passed at all.
if [ -n "${CLI_DEVICES}" ] && [ "${CLI_DEVICES}" != "none" ]; then
    configure_args+=(--device="${CLI_DEVICES}")
fi

# CLI_CAPABILITIES is a space-separated list; word splitting is intentional.
for cap in ${CLI_CAPABILITIES}; do
    if arg=$(capability_to_cli "${cap}"); then
        configure_args+=("${arg}")
    else
        echo "ERROR: Unknown driver capability \"${cap}\"." >&2
        exit 1
    fi
done

if [ "${CLI_DISABLE_REQUIRE}" = "false" ]; then
    # CLI_REQUIREMENTS is a space-separated list; word splitting is
    # intentional, each constraint becomes its own --require option.
    for req in ${CLI_REQUIREMENTS}; do
        configure_args+=(--require="${req}")
    done
fi

# Trace the final command, then hand over to nvidia-container-cli.
# The ${array[@]+"..."} form expands to nothing for an empty array, which
# avoids an "unbound variable" error under `set -u` on older bash releases.
set -x
exec nvidia-container-cli ${global_args[@]+"${global_args[@]}"} configure \
    ${configure_args[@]+"${configure_args[@]}"} "${LXC_ROOTFS_MOUNT}"