use PVE::QemuServer::ImportDisk;
use PVE::QemuServer::Monitor qw(mon_cmd);
use PVE::QemuServer::Machine;
-use PVE::QemuServer::USB qw(parse_usb_device);
+use PVE::QemuServer::PCI;
+use PVE::QemuServer::USB;
use PVE::QemuMigrate;
use PVE::RPCEnvironment;
use PVE::AccessControl;
return 1;
};
+# Checks whether $authuser may set the hostpci option $opt to $value.
+#
+# root@pam is always allowed. Everybody else may only reference a cluster wide
+# mapping (never a raw 'host' device) and needs 'Mapping.Use' on that mapping
+# plus 'VM.Config.HWType' on the VM; the permission helpers are expected to
+# raise on missing privileges. Returns 1 when access is granted.
+my sub check_hostpci_perm {
+    my ($rpcenv, $authuser, $vmid, $pool, $opt, $value) = @_;
+
+    return 1 if $authuser eq 'root@pam';
+
+    my $parsed = PVE::JSONSchema::parse_property_string('pve-qm-hostpci', $value);
+
+    # raw host devices are reserved for root
+    die "only root can set '$opt' config for non-mapped devices\n" if $parsed->{host};
+
+    my $mapping_id = $parsed->{mapping}
+        or die "either 'host' or 'mapping' must be set.\n";
+
+    $rpcenv->check_full($authuser, "/mapping/pci/$mapping_id", ['Mapping.Use']);
+    $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.HWType']);
+
+    return 1;
+}
+
+# Runs check_hostpci_perm for every 'hostpciN' option contained in $param.
+# root@pam skips all checks. Returns 1 if no check died.
+my sub check_vm_create_hostpci_perm {
+    my ($rpcenv, $authuser, $vmid, $pool, $param) = @_;
+
+    return 1 if $authuser eq 'root@pam';
+
+    for my $key (grep { m/^hostpci\d+$/ } keys %{$param}) {
+        check_hostpci_perm($rpcenv, $authuser, $vmid, $pool, $key, $param->{$key});
+    }
+
+    return 1;
+};
+
my $check_vm_modify_config_perm = sub {
my ($rpcenv, $authuser, $vmid, $pool, $key_list) = @_;
# else, as there the permission can be value dependend
next if PVE::QemuServer::is_valid_drivename($opt);
next if $opt eq 'cdrom';
- next if $opt =~ m/^(?:unused|serial|usb)\d+$/;
+ next if $opt =~ m/^(?:unused|serial|usb|hostpci)\d+$/;
next if $opt eq 'tags';
# also needs privileges on the storage, that will be checked later
$rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Disk', 'VM.PowerMgmt' ]);
} else {
- # catches hostpci\d+, args, lock, etc.
+ # catches args, lock, etc.
# new options will be checked here
die "only root can set '$opt' config\n";
}
&$check_vm_create_serial_perm($rpcenv, $authuser, $vmid, $pool, $param);
check_vm_create_usb_perm($rpcenv, $authuser, $vmid, $pool, $param);
+ check_vm_create_hostpci_perm($rpcenv, $authuser, $vmid, $pool, $param);
PVE::QemuServer::check_bridge_access($rpcenv, $authuser, $param);
&$check_cpu_model_access($rpcenv, $authuser, $param);
check_usb_perm($rpcenv, $authuser, $vmid, undef, $opt, $val);
PVE::QemuConfig->add_to_pending_delete($conf, $opt, $force);
PVE::QemuConfig->write_config($vmid, $conf);
+ } elsif ($opt =~ m/^hostpci\d+$/) {
+ check_hostpci_perm($rpcenv, $authuser, $vmid, undef, $opt, $val);
+ PVE::QemuConfig->add_to_pending_delete($conf, $opt, $force);
+ PVE::QemuConfig->write_config($vmid, $conf);
} elsif ($opt eq 'tags') {
assert_tag_permissions($vmid, $val, '', $rpcenv, $authuser);
delete $conf->{$opt};
}
check_usb_perm($rpcenv, $authuser, $vmid, undef, $opt, $param->{$opt});
$conf->{pending}->{$opt} = $param->{$opt};
+ } elsif ($opt =~ m/^hostpci\d+$/) {
+ if (my $oldvalue = $conf->{$opt}) {
+ check_hostpci_perm($rpcenv, $authuser, $vmid, undef, $opt, $oldvalue);
+ }
+ check_hostpci_perm($rpcenv, $authuser, $vmid, undef, $opt, $param->{$opt});
+ $conf->{pending}->{$opt} = $param->{$opt};
} elsif ($opt eq 'tags') {
assert_tag_permissions($vmid, $conf->{$opt}, $param->{$opt}, $rpcenv, $authuser);
$conf->{pending}->{$opt} = PVE::GuestHelpers::get_unique_tags($param->{$opt});
my $bootorder = device_bootorder($conf);
# host pci device passthrough
- my ($kvm_off, $gpu_passthrough, $legacy_igd) = PVE::QemuServer::PCI::print_hostpci_devices(
- $vmid, $conf, $devices, $vga, $winversion, $q35, $bridges, $arch, $machine_type, $bootorder);
+ my ($kvm_off, $gpu_passthrough, $legacy_igd, $pci_devices) = PVE::QemuServer::PCI::print_hostpci_devices(
+ $vmid, $conf, $devices, $vga, $winversion, $bridges, $arch, $machine_type, $bootorder);
# usb devices
my $usb_dev_features = {};
push @$cmd, @$aa;
}
- return wantarray ? ($cmd, $vollist, $spice_port) : $cmd;
+ return wantarray ? ($cmd, $vollist, $spice_port, $pci_devices) : $cmd;
}
sub check_rng_source {
print "Resuming suspended VM\n";
}
- my ($cmd, $vollist, $spice_port) = config_to_command($storecfg, $vmid,
+ my ($cmd, $vollist, $spice_port, $pci_devices) = config_to_command($storecfg, $vmid,
$conf, $defaults, $forcemachine, $forcecpu, $params->{'pbs-backing'});
my $migration_ip;
my $start_timeout = $params->{timeout} // config_aware_timeout($conf, $resume);
- my $pci_devices = {}; # host pci devices
- for (my $i = 0; $i < $PVE::QemuServer::PCI::MAX_HOSTPCI_DEVICES; $i++) {
- my $dev = $conf->{"hostpci$i"} or next;
- $pci_devices->{$i} = parse_hostpci($dev);
+ my $pci_reserve_list = [];
+ for my $device (values $pci_devices->%*) {
+ next if $device->{mdev}; # we don't reserve for mdev devices
+ push $pci_reserve_list->@*, map { $_->{id} } $device->{ids}->@*;
}
- # do not reserve pciid for mediated devices, sysfs will error out for duplicate assignment
- my $real_pci_devices = [ grep { !(defined($_->{mdev}) && scalar($_->{pciid}->@*) == 1) } values $pci_devices->%* ];
-
- # map to a flat list of pci ids
- my $pci_id_list = [ map { $_->{id} } map { $_->{pciid}->@* } $real_pci_devices->@* ];
-
# reserve all PCI IDs before actually doing anything with them
- PVE::QemuServer::PCI::reserve_pci_usage($pci_id_list, $vmid, $start_timeout);
+ PVE::QemuServer::PCI::reserve_pci_usage($pci_reserve_list, $vmid, $start_timeout);
eval {
my $uuid;
for my $id (sort keys %$pci_devices) {
my $d = $pci_devices->{$id};
- for my $dev ($d->{pciid}->@*) {
- my $info = PVE::QemuServer::PCI::prepare_pci_device($vmid, $dev->{id}, $id, $d->{mdev});
-
- # nvidia grid needs the qemu parameter '-uuid' set
- # use smbios uuid or mdev uuid as fallback for that
- if ($d->{mdev} && !defined($uuid) && $info->{vendor} eq '10de') {
- if (defined($conf->{smbios1})) {
- my $smbios_conf = parse_smbios1($conf->{smbios1});
- $uuid = $smbios_conf->{uuid} if defined($smbios_conf->{uuid});
- }
- $uuid = PVE::QemuServer::PCI::generate_mdev_uuid($vmid, $id) if !defined($uuid);
+ my ($index) = ($id =~ m/^hostpci(\d+)$/);
+
+ my $chosen_mdev;
+ for my $dev ($d->{ids}->@*) {
+ my $info = eval { PVE::QemuServer::PCI::prepare_pci_device($vmid, $dev->{id}, $index, $d->{mdev}) };
+ if ($d->{mdev}) {
+ warn $@ if $@;
+ $chosen_mdev = $info;
+ last if $chosen_mdev; # if successful, we're done
+ } else {
+ die $@ if $@;
+ }
+ }
+
+ next if !$d->{mdev};
+ die "could not create mediated device\n" if !defined($chosen_mdev);
+
+ # nvidia grid needs the uuid of the mdev as qemu parameter
+ if (!defined($uuid) && $chosen_mdev->{vendor} =~ m/^(0x)?10de$/) {
+ if (defined($conf->{smbios1})) {
+ my $smbios_conf = parse_smbios1($conf->{smbios1});
+ $uuid = $smbios_conf->{uuid} if defined($smbios_conf->{uuid});
}
+ $uuid = PVE::QemuServer::PCI::generate_mdev_uuid($vmid, $index) if !defined($uuid);
}
}
push @$cmd, '-uuid', $uuid if defined($uuid);
# re-reserve all PCI IDs now that we can know the actual VM PID
my $pid = PVE::QemuServer::Helpers::vm_running_locally($vmid);
- eval { PVE::QemuServer::PCI::reserve_pci_usage($pci_id_list, $vmid, undef, $pid) };
+ eval { PVE::QemuServer::PCI::reserve_pci_usage($pci_reserve_list, $vmid, undef, $pid) };
warn $@ if $@;
if (defined($res->{migrate})) {
# some nvidia vgpu driver versions want to clean the mdevs up themselves, and error
# out when we do it first. so wait for 10 seconds and then try it
- my $pciid = $d->{pciid}->[0]->{id};
- my $info = PVE::SysFSTools::pci_device_info("$pciid");
- if ($info->{vendor} eq '10de') {
+ if ($d->{ids}->[0]->[0]->{vendor} =~ m/^(0x)?10de$/) {
sleep 10;
}
} else {
die "either 'host' or 'mapping' must be set.\n";
}
+ } elsif ($opt =~ m/^hostpci\d+$/) {
+ my $device = PVE::JSONSchema::parse_property_string('pve-qm-hostpci', $conf->{$opt});
+ if ($device->{host}) {
+ die "only root can set '$opt' config for non-mapped devices\n" if $user ne 'root@pam';
+ } elsif ($device->{mapping}) {
+ $rpcenv->check_full($user, "/mapping/pci/$device->{mapping}", ['Mapping.Use']);
+ } else {
+ die "either 'host' or 'mapping' must be set.\n";
+ }
}
}
};
use strict;
use PVE::JSONSchema;
+use PVE::Mapping::PCI;
use PVE::SysFSTools;
use PVE::Tools;
my $hostpci_fmt = {
host => {
default_key => 1,
+ optional => 1,
type => 'string',
pattern => qr/$PCIRE(;$PCIRE)*/,
format_description => 'HOSTPCIID[;HOSTPCIID2...]',
'bus:dev.func' (hexadecimal numbers)
You can us the 'lspci' command to list existing PCI devices.
+
+Either this or the 'mapping' key must be set.
EODESCR
},
+ mapping => {
+ optional => 1,
+ type => 'string',
+ format_description => 'mapping-id',
+ format => 'pve-configid',
+ description => "The ID of a cluster wide mapping. Either this or the default-key 'host'"
+ ." must be set.",
+ },
rombar => {
type => 'boolean',
description => "Specify whether or not the device's ROM will be visible in the"
return $res;
}
+# returns the parsed pci config, but replaces the 'host' (or 'mapping') part
+# with a list of lists stored in the 'ids' property, like this:
+#
+# {
+# mdev => 1,
+# rombar => ...
+# ...
+# ids => [
+# # this contains a list of alternative devices,
+# [
+# # which are themselves lists of ids for one multifunction device
+# {
+# id => "0000:00:00.0",
+# vendor => "...",
+# },
+# {
+# id => "0000:00:00.1",
+# vendor => "...",
+# },
+# ],
+# [
+# ...
+# ],
+# ...
+# ],
+# }
+#
+# dies if both or neither of 'host' and 'mapping' are set, if a mapping cannot
+# be resolved or validated on the current node, if no PCI device is found for
+# an id, or if 'mdev' is combined with a multifunction device
sub parse_hostpci {
my ($value) = @_;
my $res = PVE::JSONSchema::parse_property_string($hostpci_fmt, $value);
- my @idlist = split(/;/, $res->{host});
- delete $res->{host};
- foreach my $id (@idlist) {
- my $devs = PVE::SysFSTools::lspci($id);
- die "no PCI device found for '$id'\n" if !scalar(@$devs);
- push @{$res->{pciid}}, @$devs;
+ # one entry per alternative, each entry being the list of ';'-separated id strings
+ my $alternatives = [];
+ my $host = delete $res->{host};
+ my $mapping = delete $res->{mapping};
+
+ die "Cannot set both 'host' and 'mapping'.\n" if defined($host) && defined($mapping);
+
+ if ($mapping) {
+ # we have no ordinary pci id, must be a mapping
+ my $devices = PVE::Mapping::PCI::find_on_current_node($mapping);
+ die "PCI device mapping not found for '$mapping'\n" if !$devices || !scalar($devices->@*);
+
+ for my $device ($devices->@*) {
+ eval { PVE::Mapping::PCI::assert_valid($mapping, $device) };
+ die "PCI device mapping invalid (hardware probably changed): $@\n" if $@;
+ push $alternatives->@*, [split(/;/, $device->{path})];
+ }
+ } elsif ($host) {
+ push $alternatives->@*, [split(/;/, $host)];
+ } else {
+ die "Either 'host' or 'mapping' must be set.\n";
}
+
+ # look up every id string via lspci and collect the results per alternative
+ $res->{ids} = [];
+ for my $alternative ($alternatives->@*) {
+ my $ids = [];
+ foreach my $id ($alternative->@*) {
+ my $devs = PVE::SysFSTools::lspci($id);
+ die "no PCI device found for '$id'\n" if !scalar($devs->@*);
+ push $ids->@*, @$devs;
+ }
+ if (scalar($ids->@*) > 1) {
+ $res->{'has-multifunction'} = 1;
+ die "cannot use mediated device with multifunction device\n" if $res->{mdev};
+ }
+ push $res->{ids}->@*, $ids;
+ }
+
return $res;
}
+# parses all hostpci devices from a config and does some sanity checks
+#
+# $conf is the VM config hash; the 'hostpciN' and 'vga' keys are read here and
+# the whole hash is handed to machine_type_is_q35.
+#
+# returns a hash like this (slots that are not configured are left out):
+# {
+#     hostpci0 => {
+#         # hash from parse_hostpci function
+#     },
+#     hostpci1 => { ... },
+#     ...
+# }
+#
+# dies if 'pcie' is requested without a q35 machine type, or if a 'legacy-igd'
+# device violates one of the constraints checked below
+sub parse_hostpci_devices {
+    my ($conf) = @_;
+
+    my $q35 = PVE::QemuServer::Machine::machine_type_is_q35($conf);
+    my $legacy_igd = 0;
+
+    my $parsed_devices = {};
+    for my $i (0 .. $MAX_HOSTPCI_DEVICES - 1) {
+        my $id = "hostpci$i";
+
+        # skip unset slots before parsing; parse_hostpci dies with "Either 'host'
+        # or 'mapping' must be set." when neither key is present
+        next if !$conf->{$id};
+
+        my $d = parse_hostpci($conf->{$id});
+        next if !$d;
+
+        # check syntax
+        die "q35 machine model is not enabled" if !$q35 && $d->{pcie};
+
+        if ($d->{'legacy-igd'}) {
+            die "only one device can be assigned in legacy-igd mode\n"
+                if $legacy_igd;
+            $legacy_igd = 1;
+
+            die "legacy IGD assignment requires VGA mode to be 'none'\n"
+                if !defined($conf->{'vga'}) || $conf->{'vga'} ne 'none';
+            die "legacy IGD assignment requires rombar to be enabled\n"
+                if defined($d->{rombar}) && !$d->{rombar};
+            die "legacy IGD assignment is not compatible with x-vga\n"
+                if $d->{'x-vga'};
+            die "legacy IGD assignment is not compatible with mdev\n"
+                if $d->{mdev};
+            die "legacy IGD assignment is not compatible with q35\n"
+                if $q35;
+            die "legacy IGD assignment is not compatible with multifunction devices\n"
+                if $d->{'has-multifunction'};
+            die "legacy IGD assignment is not compatible with alternate devices\n"
+                if scalar($d->{ids}->@*) > 1;
+            # check first device for valid id
+            die "legacy IGD assignment only works for devices on host bus 00:02.0\n"
+                if $d->{ids}->[0]->[0]->{id} !~ m/02\.0$/;
+        }
+
+        $parsed_devices->{$id} = $d;
+    }
+
+    return $parsed_devices;
+}
+
+# takes the hash returned by parse_hostpci_devices and, for all non-mdev
+# devices, selects one of the given alternatives by trying to reserve it
+#
+# mdev devices must be chosen later when we actually allocate it, but we
+# flatten the inner list since there can only be one device per alternative anyway
+#
+# $devices is modified in place (each entry's 'ids' becomes a flat list of
+# device hashes) and also returned. dies if a device id would be assigned
+# more than once or if no alternative of an entry can be reserved.
+my sub choose_hostpci_devices {
+    my ($devices, $vmid) = @_;
+
+    # ids already handed out to an earlier hostpci entry of this VM
+    my $used = {};
+
+    my $add_used_device = sub {
+        my ($chosen) = @_;
+        for my $used_device ($chosen->@*) {
+            my $used_id = $used_device->{id};
+            die "device '$used_id' assigned more than once\n" if $used->{$used_id};
+            $used->{$used_id} = 1;
+        }
+    };
+
+    for (my $i = 0; $i < $MAX_HOSTPCI_DEVICES; $i++) {
+        my $device = $devices->{"hostpci$i"};
+        next if !$device;
+
+        if ($device->{mdev}) {
+            # keep all alternatives, but only the first device of each
+            $device->{ids} = [ map { $_->[0] } $device->{ids}->@* ];
+            next;
+        }
+
+        if (scalar($device->{ids}->@*) == 1) {
+            # we only have one alternative, use that
+            $device->{ids} = $device->{ids}->[0];
+            $add_used_device->($device->{ids});
+            next;
+        }
+
+        my $found = 0;
+        for my $alternative ($device->{ids}->@*) {
+            my $ids = [map { $_->{id} } @$alternative];
+
+            next if grep { defined($used->{$_}) } @$ids; # already used
+            # a failed reservation just means: try the next alternative
+            eval { reserve_pci_usage($ids, $vmid, 10, undef) };
+            next if $@;
+
+            # found one that is not used or reserved
+            $add_used_device->($alternative);
+            $device->{ids} = $alternative;
+            $found = 1;
+            last;
+        }
+        die "could not find a free device for 'hostpci$i'\n" if !$found;
+    }
+
+    return $devices;
+}
+
sub print_hostpci_devices {
- my ($vmid, $conf, $devices, $vga, $winversion, $q35, $bridges, $arch, $machine_type, $bootorder) = @_;
+ my ($vmid, $conf, $devices, $vga, $winversion, $bridges, $arch, $machine_type, $bootorder) = @_;
my $kvm_off = 0;
my $gpu_passthrough = 0;
my $legacy_igd = 0;
my $pciaddr;
+ my $pci_devices = choose_hostpci_devices(parse_hostpci_devices($conf), $vmid);
+
for (my $i = 0; $i < $MAX_HOSTPCI_DEVICES; $i++) {
my $id = "hostpci$i";
- my $d = parse_hostpci($conf->{$id});
+ my $d = $pci_devices->{$id};
next if !$d;
+ $legacy_igd = 1 if $d->{'legacy-igd'};
+
if (my $pcie = $d->{pcie}) {
- die "q35 machine model is not enabled" if !$q35;
# win7 wants to have the pcie devices directly on the pcie bus
# instead of in the root port
if ($winversion == 7) {
$pciaddr = print_pci_addr($pci_name, $bridges, $arch, $machine_type);
}
- my $pcidevices = $d->{pciid};
- my $multifunction = @$pcidevices > 1;
-
- if ($d->{'legacy-igd'}) {
- die "only one device can be assigned in legacy-igd mode\n"
- if $legacy_igd;
- $legacy_igd = 1;
-
- die "legacy IGD assignment requires VGA mode to be 'none'\n"
- if !defined($conf->{'vga'}) || $conf->{'vga'} ne 'none';
- die "legacy IGD assignment requires rombar to be enabled\n"
- if defined($d->{rombar}) && !$d->{rombar};
- die "legacy IGD assignment is not compatible with x-vga\n"
- if $d->{'x-vga'};
- die "legacy IGD assignment is not compatible with mdev\n"
- if $d->{mdev};
- die "legacy IGD assignment is not compatible with q35\n"
- if $q35;
- die "legacy IGD assignment is not compatible with multifunction devices\n"
- if $multifunction;
- die "legacy IGD assignment only works for devices on host bus 00:02.0\n"
- if $pcidevices->[0]->{id} !~ m/02\.0$/;
- }
+ my $num_devices = scalar($d->{ids}->@*);
+ my $multifunction = $num_devices > 1 && !$d->{mdev};
my $xvga = '';
if ($d->{'x-vga'}) {
}
my $sysfspath;
- if ($d->{mdev} && scalar(@$pcidevices) == 1) {
+ if ($d->{mdev}) {
my $uuid = generate_mdev_uuid($vmid, $i);
$sysfspath = "/sys/bus/mdev/devices/$uuid";
- } elsif ($d->{mdev}) {
- warn "ignoring mediated device '$id' with multifunction device\n";
}
- my $j = 0;
- foreach my $pcidevice (@$pcidevices) {
+ for (my $j = 0; $j < $num_devices; $j++) {
+ my $pcidevice = $d->{ids}->[$j];
my $devicestr = "vfio-pci";
if ($sysfspath) {
}
}
+
push @$devices, '-device', $devicestr;
- $j++;
+ last if $d->{mdev};
}
}
- return ($kvm_off, $gpu_passthrough, $legacy_igd);
+ return ($kvm_off, $gpu_passthrough, $legacy_igd, $pci_devices);
}
sub prepare_pci_device {
warn "leftover PCI reservation found for $id, lets take it...\n";
}
}
+ } elsif ($reservation) {
+ # already reserved by the same vmid
+ if (my $reserved_time = $reservation->{time}) {
+ if (defined($timeout)) {
+ # use the longer timeout
+ my $old_timeout = $reservation->{time} - 5 - $ctime;
+ $timeout = $old_timeout if $old_timeout > $timeout;
+ }
+ } elsif (my $reserved_pid = $reservation->{pid}) {
+ my $running_pid = PVE::QemuServer::Helpers::vm_running_locally($reservation->{vmid});
+ if (defined($running_pid) && $running_pid == $reservation->{pid}) {
+ if (defined($pid)) {
+ die "PCI device '$id' already in use by running VMID '$reservation->{vmid}'\n";
+ } elsif (defined($timeout)) {
+ # ignore timeout reservation for running vms, can happen with e.g.
+ # qm showcmd
+ return;
+ }
+ }
+ }
}
$reservation_list->{$id} = { vmid => $vmid };
hostpci2: 00:f4.0
hostpci3: d0:15.1,pcie=1
hostpci4: d0:17.0,pcie=1,rombar=0
-hostpci7: d0:15.1,pcie=1
+hostpci7: d0:15.2,pcie=1
machine: q35
memory: 512
net0: virtio=2E:01:68:F9:9C:87,bridge=vmbr0
-device 'pcie-root-port,id=ich9-pcie-port-5,addr=10.0,x-speed=16,x-width=32,multifunction=on,bus=pcie.0,port=5,chassis=5' \
-device 'vfio-pci,host=0000:d0:17.0,id=hostpci4,bus=ich9-pcie-port-5,addr=0x0,rombar=0' \
-device 'pcie-root-port,id=ich9-pcie-port-8,addr=10.3,x-speed=16,x-width=32,multifunction=on,bus=pcie.0,port=8,chassis=8' \
- -device 'vfio-pci,host=0000:d0:15.1,id=hostpci7,bus=ich9-pcie-port-8,addr=0x0' \
+ -device 'vfio-pci,host=0000:d0:15.2,id=hostpci7,bus=ich9-pcie-port-8,addr=0x0' \
-device 'VGA,id=vga,bus=pcie.0,addr=0x1' \
-device 'virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x3,free-page-reporting=on' \
-iscsi 'initiator-name=iqn.1993-08.org.debian:01:aabbccddeeff' \
"0000:0f:f2.0",
"0000:d0:13.0",
"0000:d0:15.1",
+ "0000:d0:15.2",
"0000:d0:17.0",
"0000:f0:42.0",
"0000:f0:43.0",