From 49c51a60db7f12d7fe2073b755d18b4d9b628fbd Mon Sep 17 00:00:00 2001 From: Dominik Csapak Date: Fri, 24 Feb 2023 14:04:31 +0100 Subject: [PATCH] pci: workaround nvidia driver issue on mdev cleanup in some nvidia grid drivers (e.g. 14.4 and 15.x), their kernel module tries to clean up the mdev device when the vm is shut down and if it cannot do that (e.g. because we already cleaned it up), their removal process cancels with an error such that the vgpu does still exist inside their book-keeping, but can't be used/recreated/freed until a reboot. since there seems to be no obvious way to detect if that's the case besides either parsing dmesg (which is racy), or the nvidia kernel module version (which I'd rather not do), we simply test the pci device vendor for nvidia and add a 10s sleep. that should give the driver enough time to clean up and we will not find the path anymore and skip the cleanup. This way, it works with both the newer and older versions of the driver (some of the older drivers are LTS releases, so they're still supported). Signed-off-by: Dominik Csapak --- PVE/QemuServer.pm | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/PVE/QemuServer.pm b/PVE/QemuServer.pm index 349cfe4e..b5836f7a 100644 --- a/PVE/QemuServer.pm +++ b/PVE/QemuServer.pm @@ -6166,6 +6166,15 @@ sub cleanup_pci_devices { # NOTE: avoid PVE::SysFSTools::pci_cleanup_mdev_device as it requires PCI ID and we # don't want to break ABI just for this two liner my $dev_sysfs_dir = "/sys/bus/mdev/devices/$uuid"; + + # some nvidia vgpu driver versions want to clean the mdevs up themselves, and error + # out when we do it first. so wait for 10 seconds and then try it + my $pciid = $d->{pciid}->[0]->{id}; + my $info = PVE::SysFSTools::pci_device_info("$pciid"); + if ($info->{vendor} eq '10de') { + sleep 10; + } + PVE::SysFSTools::file_write("$dev_sysfs_dir/remove", "1") if -e $dev_sysfs_dir; } } -- 2.39.5