git.proxmox.com Git - mirror_lxc.git/blob - hooks/nvidia
Merge pull request #2496 from flx42/nvidia-hook-lgpl
[mirror_lxc.git] / hooks / nvidia
1 #! /bin/bash
2 #
3 # Copyright (c) 2017, 2018 NVIDIA CORPORATION.
4 #
5 # This library is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU Lesser General Public
7 # License as published by the Free Software Foundation; either
8 # version 2.1 of the License, or (at your option) any later version.
9 #
10 # This library is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 # Lesser General Public License for more details.
14 #
15 # You should have received a copy of the GNU Lesser General Public
16 # License along with this library; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
set -eu

# NVIDIA_VISIBLE_DEVICES="" *or* NVIDIA_VISIBLE_DEVICES="void"
# GPU support was explicitly disabled, exit early.
# "${VAR-x}" is empty only when VAR is set to the empty string; an unset
# variable expands to "x" here and is dealt with further down.
if [ -z "${NVIDIA_VISIBLE_DEVICES-x}" ] || [ "${NVIDIA_VISIBLE_DEVICES:-}" = "void" ]; then
    exit 0
fi

# https://github.com/nvidia/nvidia-container-runtime#cuda_version
if [ -n "${CUDA_VERSION:-}" ] && [ -z "${NVIDIA_REQUIRE_CUDA:-}" ]; then
    # Legacy CUDA image: default to all devices and all driver capabilities.
    if [ -z "${NVIDIA_VISIBLE_DEVICES+x}" ]; then
        NVIDIA_VISIBLE_DEVICES="all"
    fi
    if [ -z "${NVIDIA_DRIVER_CAPABILITIES:-}" ]; then
        NVIDIA_DRIVER_CAPABILITIES="all"
    fi
    if [[ "${CUDA_VERSION}" =~ ^[0-9]+\.[0-9]+ ]]; then
        # Must be exported: the requirements are gathered later in this script
        # with `compgen -e`, which only lists exported variables. A plain
        # assignment would be invisible there whenever the variable did not
        # already come from the environment.
        export NVIDIA_REQUIRE_CUDA="cuda>=${BASH_REMATCH[0]}"
    fi
else
    # NVIDIA_VISIBLE_DEVICES unset and it's not a legacy CUDA image.
    # This is not a GPU image, exit early.
    if [ -z "${NVIDIA_VISIBLE_DEVICES+x}" ]; then
        exit 0
    fi
fi

# Make sure the usual system directories are searched for the tools below.
export PATH="${PATH}:/usr/sbin:/usr/bin:/sbin:/bin"
# `command -v` is a shell builtin, so the check does not depend on `which`.
if ! command -v nvidia-container-cli >/dev/null; then
    echo "ERROR: Missing tool nvidia-container-cli, see https://github.com/NVIDIA/libnvidia-container" >&2
    exit 1
fi
52
# Classify the uid mapping of the current process.
# Reads /proc/self/uid_map and, for the final comparison, /proc/1/uid_map.
# Outputs exactly one word on stdout:
#   no          - /proc/self/uid_map does not exist, or a line's first three
#                 fields are "0 0 4294967295"
#   userns-root - a line whose second and third fields are "0" and "1", or the
#                 whole map is identical to the map of PID 1
#   yes         - anything else
in_userns() {
    local f1 f2 f3 _rest

    if [ ! -e /proc/self/uid_map ]; then
        echo no
        return
    fi

    # Only the first three whitespace-separated fields of each line matter.
    while read -r f1 f2 f3 _rest; do
        if [ "${f1} ${f2} ${f3}" = "0 0 4294967295" ]; then
            echo no
            return
        fi
        if [ "${f2}" = "0" ] && [ "${f3}" = "1" ]; then
            echo userns-root
            return
        fi
    done < /proc/self/uid_map

    if [ "$(cat /proc/self/uid_map)" = "$(cat /proc/1/uid_map)" ]; then
        echo userns-root
        return
    fi
    echo yes
}
65
# Print the path of the ldconfig binary found on PATH.
# "ldconfig.real" is tried first, then "ldconfig".
# Outputs: the resolved path on stdout.
# Returns: 0 when one of them was found, non-zero (and no output) otherwise.
get_ldconfig() {
    # `command -v` is a builtin: no dependency on an external `which`.
    command -v "ldconfig.real" || command -v "ldconfig"
}
70
# Translate one driver capability name into the matching command-line flag.
# Arguments: $1 - capability name
# Outputs:   "--<name>" on stdout for a known capability
# Returns:   0 for compute, compat32, display, graphics, utility or video;
#            1 (without output) for anything else.
# Uses `return` rather than `exit`, so a direct call cannot terminate the
# calling shell; callers using $(...) see the same status as before.
capability_to_cli() {
    case "${1:-}" in
        compute|compat32|display|graphics|utility|video)
            echo "--$1"
            ;;
        *)
            return 1
            ;;
    esac
}
83
# Same behavior as strconv.ParseBool in golang.
# Arguments: $1 - the string to interpret
# Outputs:   "true" or "false" on stdout
# Returns:   0 when $1 is one of the accepted spellings, 1 (without output)
#            otherwise.
# Uses `return` rather than `exit`, so a direct call cannot terminate the
# calling shell; callers using $(...) see the same status as before.
parse_bool() {
    case "${1:-}" in
        1|t|T|TRUE|true|True)
            echo "true"
            ;;
        0|f|F|FALSE|false|False)
            echo "false"
            ;;
        *)
            return 1
            ;;
    esac
}
93
# Write the hook's command-line help to stdout. Always returns 0.
usage() {
    printf '%s\n' \
        "nvidia-container-cli hook for LXC" \
        "" \
        "Special arguments:" \
        "[ -h | --help ]: Print this help message and exit." \
        "" \
        "Optional arguments:" \
        "[ --no-load-kmods ]: Do not try to load the NVIDIA kernel modules." \
        "[ --disable-require ]: Disable all the constraints of the form NVIDIA_REQUIRE_*." \
        "[ --debug <path> ]: The path to the log file." \
        "[ --ldcache <path> ]: The path to the host system's DSO cache." \
        "[ --root <path> ]: The path to the driver root directory." \
        "[ --ldconfig <path> ]: The path to the ldconfig binary, use a '@' prefix for a host path."
    return 0
}
111
# Parse the hook's own options; see usage() for their meaning.
# The getopt call is tested directly in the `if`: under `set -e` a failing
# stand-alone `options=$(getopt ...)` statement would abort the script before
# a separate `$?` check could print the usage text.
if ! options=$(getopt -o h -l help,no-load-kmods,disable-require,debug:,ldcache:,root:,ldconfig: -- "$@"); then
    usage
    exit 1
fi
# Replace the positional parameters with getopt's normalized, quoted output.
eval set -- "$options"

CLI_LOAD_KMODS="true"
CLI_DISABLE_REQUIRE="false"
CLI_DEBUG=
CLI_LDCACHE=
CLI_ROOT=
CLI_LDCONFIG=

while :; do
    case "${1:-}" in
        # -h is declared to getopt above, so it has to be handled here too.
        -h|--help) usage; exit 1;;
        --no-load-kmods) CLI_LOAD_KMODS="false"; shift 1;;
        --disable-require) CLI_DISABLE_REQUIRE="true"; shift 1;;
        --debug) CLI_DEBUG=$2; shift 2;;
        --ldcache) CLI_LDCACHE=$2; shift 2;;
        --root) CLI_ROOT=$2; shift 2;;
        --ldconfig) CLI_LDCONFIG=$2; shift 2;;
        # End of options: what remains are the non-option arguments.
        --) shift 1; break;;
        *) break;;
    esac
done
139
# Determine the section and type this hook was invoked for.
# Hook version 0 (the default when LXC_HOOK_VERSION is unset or empty) takes
# them from the second and third remaining positional arguments; version 1
# takes them from LXC_HOOK_SECTION and LXC_HOOK_TYPE.
HOOK_SECTION=
HOOK_TYPE=
if [[ "${LXC_HOOK_VERSION:-0}" == "0" ]]; then
    HOOK_SECTION="${2:-}"
    HOOK_TYPE="${3:-}"
elif [[ "${LXC_HOOK_VERSION:-0}" == "1" ]]; then
    HOOK_SECTION="${LXC_HOOK_SECTION:-}"
    HOOK_TYPE="${LXC_HOOK_TYPE:-}"
else
    echo "ERROR: Unsupported hook version: ${LXC_HOOK_VERSION}." >&2
    exit 1
fi

# Only the "lxc" section is accepted.
[[ "${HOOK_SECTION}" == "lxc" ]] || {
    echo "ERROR: Not running through LXC." >&2
    exit 1
}

# Only the "mount" hook type is accepted.
[[ "${HOOK_TYPE}" == "mount" ]] || {
    echo "ERROR: This hook must be used as a \"mount\" hook." >&2
    exit 1
}
157
# Continue only when in_userns reports "yes"; any other answer is fatal.
USERNS=$(in_userns)
case "${USERNS}" in
    yes)
        # Kernel modules are never loaded by the hook in this mode; just warn
        # when nvidia_uvm is not listed in /proc/modules.
        CLI_LOAD_KMODS="false"
        if ! grep -q nvidia_uvm /proc/modules; then
            echo "WARN: Kernel module nvidia_uvm is not loaded, nvidia-container-cli might fail. Make sure the NVIDIA device driver is installed and loaded." >&2
        fi
        ;;
    *)
        # This is a limitation of libnvidia-container.
        echo "FIXME: This hook currently only works in unprivileged mode." >&2
        exit 1
        ;;
esac
171
# https://github.com/nvidia/nvidia-container-runtime#nvidia_disable_require
# A non-empty NVIDIA_DISABLE_REQUIRE that parse_bool reads as "true" turns the
# requirement checks off; any other value leaves CLI_DISABLE_REQUIRE untouched.
if [ -n "${NVIDIA_DISABLE_REQUIRE:-}" ] \
    && [ "$(parse_bool "${NVIDIA_DISABLE_REQUIRE}")" = "true" ]; then
    CLI_DISABLE_REQUIRE="true"
fi
178
# When no --debug path was given and LXC logs at DEBUG or TRACE level, write
# the nvidia-container-cli log to "nvidia.log" in a directory derived from
# LXC_ROOTFS_PATH.
# Both LXC_* variables are read with an empty default so that a missing one
# skips this optional step instead of aborting the hook under `set -u`.
if [ -z "${CLI_DEBUG}" ]; then
    case "${LXC_LOG_LEVEL:-}" in
        DEBUG|TRACE)
            rootfs_path="${LXC_ROOTFS_PATH:-}"
            # Drop everything up to and including the first ':'.
            rootfs_path="${rootfs_path#*:}"
            # Replace a trailing "rootfs" with "hook".
            hookdir="${rootfs_path/%rootfs/hook}"
            if [ -n "${hookdir}" ] && mkdir -p -- "${hookdir}"; then
                CLI_DEBUG="${hookdir}/nvidia.log"
            fi
            ;;
    esac
fi
188
# A '@' prefix means a host path.
# Without an explicit --ldconfig, fall back to whatever get_ldconfig finds;
# if it finds nothing, CLI_LDCONFIG stays empty.
if [ -z "${CLI_LDCONFIG}" ] && host_ldconfig=$(get_ldconfig); then
    CLI_LDCONFIG="@${host_ldconfig}"
fi
195
# https://github.com/nvidia/nvidia-container-runtime#nvidia_visible_devices
CLI_DEVICES="${NVIDIA_VISIBLE_DEVICES}"

# https://github.com/nvidia/nvidia-container-runtime#nvidia_require_
# Concatenate the values of every exported NVIDIA_REQUIRE_* variable,
# separated by spaces.
CLI_REQUIREMENTS=
for req in $(compgen -e "NVIDIA_REQUIRE_"); do
    CLI_REQUIREMENTS="${CLI_REQUIREMENTS} ${!req}"
done

# https://github.com/nvidia/nvidia-container-runtime#nvidia_driver_capabilities
# Commas become spaces; the single word "all" expands to every capability and
# an empty list falls back to "utility".
nvidia_caps="${NVIDIA_DRIVER_CAPABILITIES:-}"
CLI_CAPABILITIES="${nvidia_caps//,/ }"
case "${CLI_CAPABILITIES}" in
    all)
        CLI_CAPABILITIES="compute compat32 display graphics utility video"
        ;;
    "")
        CLI_CAPABILITIES="utility"
        ;;
esac
218
# Assemble the nvidia-container-cli command line and replace this process
# with it. global_args go before the "configure" sub-command, configure_args
# after it. Every argument is stored as one array element and the arrays are
# expanded quoted, so values containing whitespace stay a single argument.
global_args=()
configure_args=()

if [ -n "${CLI_DEBUG}" ]; then
    echo "INFO: Writing nvidia-container-cli log at ${CLI_DEBUG}." >&2
    global_args+=("--debug=${CLI_DEBUG}")
fi

if [ "${CLI_LOAD_KMODS}" = "true" ]; then
    global_args+=(--load-kmods)
fi

if [ "${USERNS}" = "yes" ]; then
    global_args+=(--user)
    configure_args+=(--no-cgroups)
fi

if [ -n "${CLI_LDCACHE}" ]; then
    global_args+=("--ldcache=${CLI_LDCACHE}")
fi

if [ -n "${CLI_ROOT}" ]; then
    global_args+=("--root=${CLI_ROOT}")
fi

if [ -n "${CLI_LDCONFIG}" ]; then
    configure_args+=("--ldconfig=${CLI_LDCONFIG}")
fi

if [ -n "${CLI_DEVICES}" ] && [ "${CLI_DEVICES}" != "none" ]; then
    configure_args+=("--device=${CLI_DEVICES}")
fi

# The two lists below are space-separated strings and are split on purpose.
# Pathname expansion is switched off meanwhile so that a value such as "*"
# coming from the environment cannot expand to file names.
set -f
for cap in ${CLI_CAPABILITIES}; do
    if arg=$(capability_to_cli "${cap}"); then
        configure_args+=("${arg}")
    else
        echo "ERROR: Unknown driver capability \"${cap}\"." >&2
        exit 1
    fi
done

if [ "${CLI_DISABLE_REQUIRE}" = "false" ]; then
    for req in ${CLI_REQUIREMENTS}; do
        configure_args+=("--require=${req}")
    done
fi
set +f

if [ -d "/sys/kernel/security/apparmor" ]; then
    # Try to transition to the unconfined AppArmor profile.
    # Best effort by design: a failed write must not stop the hook.
    echo "changeprofile unconfined" > /proc/self/attr/current || true
fi

# Trace the final command line, then hand over to nvidia-container-cli.
# The ${arr[@]+"${arr[@]}"} form expands to nothing for an empty array without
# tripping `set -u` on older bash releases.
set -x
exec nvidia-container-cli ${global_args[@]+"${global_args[@]}"} configure ${configure_args[@]+"${configure_args[@]}"} "${LXC_ROOTFS_MOUNT}"