mediated device pass-through: fix race condition on VM reboot

author Dominik Csapak <d.csapak@proxmox.com>

Thu, 7 Mar 2024 09:33:37 +0000 (10:33 +0100)

committer Thomas Lamprecht <t.lamprecht@proxmox.com>

Fri, 8 Mar 2024 13:15:38 +0000 (14:15 +0100)
author Dominik Csapak <d.csapak@proxmox.com>
Thu, 7 Mar 2024 09:33:37 +0000 (10:33 +0100)
committer Thomas Lamprecht <t.lamprecht@proxmox.com>
Fri, 8 Mar 2024 13:15:38 +0000 (14:15 +0100)
diff --git a/PVE/CLI/qm.pm b/PVE/CLI/qm.pm

index b17b4fe25d5bd21e9fe188e82998972b1dc29c36..dce6c7a1a244391dd89a1cc76334bdcd78f2c15a 100755 (executable)
--- a/PVE/CLI/qm.pm
+++ b/PVE/CLI/qm.pm
@@ -915,7 +915,8 @@ __PACKAGE__->register_method({
         my $storecfg = PVE::Storage::config();
         warn "Starting cleanup for $vmid\n";
  
-       PVE::QemuConfig->lock_config($vmid, sub {
+       # mdev cleanup can take a while, so wait up to 60 seconds
+       PVE::QemuConfig->lock_config_full($vmid, 60, sub {
             my $conf = PVE::QemuConfig->load_config ($vmid);
             my $pid = PVE::QemuServer::check_running ($vmid);
             die "vm still running\n" if $pid;
diff --git a/PVE/QemuServer.pm b/PVE/QemuServer.pm

index b45507aab3f2069ac90d3bdaf003a77e713c42ba..7dc63db1f60f71a25aa1caff0be1149e2f4a80c1 100644 (file)
--- a/PVE/QemuServer.pm
+++ b/PVE/QemuServer.pm
@@ -6133,12 +6133,20 @@ sub cleanup_pci_devices {
             my $dev_sysfs_dir = "/sys/bus/mdev/devices/$uuid";
  
             # some nvidia vgpu driver versions want to clean the mdevs up themselves, and error
-           # out when we do it first. so wait for 10 seconds and then try it
-           if ($d->{ids}->[0]->[0]->{vendor} =~ m/^(0x)?10de$/) {
-               sleep 10;
+           # out when we do it first. so wait for up to 10 seconds and then try it manually
+           if ($d->{ids}->[0]->[0]->{vendor} =~ m/^(0x)?10de$/ && -e $dev_sysfs_dir) {
+               my $count = 0;
+               while (-e $dev_sysfs_dir && $count < 10) {
+                   sleep 1;
+                   $count++;
+               }
+               print "waited $count seconds for mediated device driver finishing clean up\n";
             }
  
-           PVE::SysFSTools::file_write("$dev_sysfs_dir/remove", "1") if -e $dev_sysfs_dir;
+           if (-e $dev_sysfs_dir) {
+               print "actively clean up mediated device with UUID $uuid\n";
+               PVE::SysFSTools::file_write("$dev_sysfs_dir/remove", "1");
+           }
         }
      }
      PVE::QemuServer::PCI::remove_pci_reservation($vmid);
author	Dominik Csapak <d.csapak@proxmox.com>
	Thu, 7 Mar 2024 09:33:37 +0000 (10:33 +0100)
committer	Thomas Lamprecht <t.lamprecht@proxmox.com>
	Fri, 8 Mar 2024 13:15:38 +0000 (14:15 +0100)
PVE/CLI/qm.pm		patch | blob | blame | history
PVE/QemuServer.pm		patch | blob | blame | history