]> git.proxmox.com Git - qemu-server.git/blame - PVE/QemuServer/PCI.pm
schema: fix description of migrate_downtime parameter
[qemu-server.git] / PVE / QemuServer / PCI.pm
CommitLineData
de9768f0
DC
1package PVE::QemuServer::PCI;
2
41af2dfc
TL
3use warnings;
4use strict;
5
74c17b7a 6use PVE::JSONSchema;
9b71c34d 7use PVE::Mapping::PCI;
74c17b7a 8use PVE::SysFSTools;
3bfee796 9use PVE::Tools;
74c17b7a 10
de9768f0
DC
11use base 'Exporter';
12
13our @EXPORT_OK = qw(
14print_pci_addr
15print_pcie_addr
c4e16381 16print_pcie_root_port
74c17b7a 17parse_hostpci
de9768f0
DC
18);
19
74c17b7a
SR
# Number of 'hostpciN' config keys (hostpci0 .. hostpci15) that are scanned.
our $MAX_HOSTPCI_DEVICES = 16;

# A single host PCI id: optional domain (4+ hex digits), bus, device and an
# optional function part.
my $PCIRE = qr/(?:[a-f0-9]{4,}:)?[a-f0-9]{2}:[a-f0-9]{2}(?:\.[a-f0-9])?/;

# Property-string schema of a 'hostpciN' config value. It is registered below
# under the format name 'pve-qm-hostpci' and is also what parse_hostpci() uses
# to parse a raw config value.
my $hostpci_fmt = {
    host => {
        default_key => 1,
        optional => 1,
        type => 'string',
        pattern => qr/$PCIRE(;$PCIRE)*/,
        format_description => 'HOSTPCIID[;HOSTPCIID2...]',
        description => <<EODESCR,
Host PCI device pass through. The PCI ID of a host's PCI device or a list
of PCI virtual functions of the host. HOSTPCIID syntax is:

'bus:dev.func' (hexadecimal numbers)

You can use the 'lspci' command to list existing PCI devices.

Either this or the 'mapping' key must be set.
EODESCR
    },
    mapping => {
        optional => 1,
        type => 'string',
        format_description => 'mapping-id',
        format => 'pve-configid',
        description => "The ID of a cluster wide mapping. Either this or the default-key 'host'"
            ." must be set.",
    },
    rombar => {
        type => 'boolean',
        description => "Specify whether or not the device's ROM will be visible in the"
            ." guest's memory map.",
        optional => 1,
        default => 1,
    },
    romfile => {
        type => 'string',
        pattern => '[^,;]+',
        format_description => 'string',
        description => "Custom pci device rom filename (must be located in /usr/share/kvm/).",
        optional => 1,
    },
    pcie => {
        type => 'boolean',
        description => "Choose the PCI-express bus (needs the 'q35' machine model).",
        optional => 1,
        default => 0,
    },
    'x-vga' => {
        type => 'boolean',
        description => "Enable vfio-vga device support.",
        optional => 1,
        default => 0,
    },
    'legacy-igd' => {
        type => 'boolean',
        description => "Pass this device in legacy IGD mode, making it the primary and exclusive"
            ." graphics device in the VM. Requires 'pc-i440fx' machine type and VGA set to 'none'.",
        optional => 1,
        default => 0,
    },
    'mdev' => {
        type => 'string',
        format_description => 'string',
        pattern => '[^/\.:]+',
        optional => 1,
        description => <<EODESCR
The type of mediated device to use.
An instance of this type will be created on startup of the VM and
will be cleaned up when the VM stops.
EODESCR
    },
    'vendor-id' => {
        type => 'string',
        pattern => qr/^0x[0-9a-fA-F]{4}$/,
        format_description => 'hex id',
        optional => 1,
        description => "Override PCI vendor ID visible to guest"
    },
    'device-id' => {
        type => 'string',
        pattern => qr/^0x[0-9a-fA-F]{4}$/,
        format_description => 'hex id',
        optional => 1,
        description => "Override PCI device ID visible to guest"
    },
    'sub-vendor-id' => {
        type => 'string',
        pattern => qr/^0x[0-9a-fA-F]{4}$/,
        format_description => 'hex id',
        optional => 1,
        description => "Override PCI subsystem vendor ID visible to guest"
    },
    'sub-device-id' => {
        type => 'string',
        pattern => qr/^0x[0-9a-fA-F]{4}$/,
        format_description => 'hex id',
        optional => 1,
        description => "Override PCI subsystem device ID visible to guest"
    }
};
PVE::JSONSchema::register_format('pve-qm-hostpci', $hostpci_fmt);
123
# Schema of the 'hostpciN' VM config option itself: a string in the
# 'pve-qm-hostpci' property-string format. Registered as a standard option
# under the same name.
our $hostpcidesc = {
    type => 'string',
    format => 'pve-qm-hostpci',
    optional => 1,
    description => "Map host PCI devices into guest.",
    verbose_description => <<EODESCR,
Map host PCI devices into guest.

NOTE: This option allows direct access to host hardware. So it is no longer
possible to migrate such machines - use with special care.

CAUTION: Experimental! User reported problems with this option.
EODESCR
};
PVE::JSONSchema::register_standard_option("pve-qm-hostpci", $hostpcidesc);
138
d7d698f6
TL
# Cache for get_pci_addr_map(); built on first use.
my $pci_addr_map;

# Returns the table of fixed PCI slot assignments, keyed by device id.
#
# Every value is a hash with the numeric 'bus' and 'addr' of the slot. A few
# entries additionally carry 'conflict_ok', naming the one other id that is
# assigned the very same slot on purpose.
#
# The table is created once and the same hash reference is returned on every
# call.
sub get_pci_addr_map {
    return $pci_addr_map if defined($pci_addr_map);

    # entries that do not follow a regular numbering scheme
    my %slot_of = (
        # bus 0
        piix3 => { bus => 0, addr => 1, conflict_ok => 'ehci' },
        ehci => { bus => 0, addr => 1, conflict_ok => 'piix3' }, # instead of piix3 on arm
        vga => { bus => 0, addr => 2, conflict_ok => 'legacy-igd' },
        'legacy-igd' => { bus => 0, addr => 2, conflict_ok => 'vga' }, # legacy-igd requires vga=none
        balloon0 => { bus => 0, addr => 3 },
        watchdog => { bus => 0, addr => 4 },
        scsihw0 => { bus => 0, addr => 5, conflict_ok => 'pci.3' },
        'pci.3' => { bus => 0, addr => 5, conflict_ok => 'scsihw0' }, # also used for virtio-scsi-single bridge
        scsihw1 => { bus => 0, addr => 6 },
        ahci0 => { bus => 0, addr => 7 },
        qga0 => { bus => 0, addr => 8 },
        spice => { bus => 0, addr => 9 },
        hostpci0 => { bus => 0, addr => 16 },
        hostpci1 => { bus => 0, addr => 17 },
        hostpci2 => { bus => 0, addr => 27 },
        hostpci3 => { bus => 0, addr => 28 },
        #addr29 : usb-host (pve-usb.cfg)
        'pci.1' => { bus => 0, addr => 30 },
        'pci.2' => { bus => 0, addr => 31 },
        # bus 1
        xhci => { bus => 1, addr => 27 },
        'pci.4' => { bus => 1, addr => 28 },
        rng0 => { bus => 1, addr => 29 },
        'pci.2-igd' => { bus => 1, addr => 30 }, # replaces pci.2 in case a legacy IGD device is passed through
        # bus 2
        ivshmem => { bus => 2, addr => 11 },
        audio0 => { bus => 2, addr => 12 },
        # bus 4
        scsihw2 => { bus => 4, addr => 1 },
        scsihw3 => { bus => 4, addr => 2 },
        scsihw4 => { bus => 4, addr => 3 },
    );

    # consecutively numbered devices occupy consecutive slots
    $slot_of{"virtio$_"} = { bus => 0, addr => $_ + 10 } for 0 .. 5;       # addr 10..15
    $slot_of{"net$_"} = { bus => 0, addr => $_ + 18 } for 0 .. 5;          # addr 18..23
    $slot_of{"vga$_"} = { bus => 0, addr => $_ + 23 } for 1 .. 3;          # addr 24..26
    $slot_of{"net$_"} = { bus => 1, addr => $_ - 5 } for 6 .. 31;          # addr 1..26
    $slot_of{"virtio$_"} = { bus => 2, addr => $_ - 5 } for 6 .. 15;       # addr 1..10
    $slot_of{"hostpci$_"} = { bus => 2, addr => $_ + 9 } for 4 .. 15;      # addr 13..24
    $slot_of{"virtioscsi$_"} = { bus => 3, addr => $_ + 1 } for 0 .. 30;   # addr 1..31

    $pci_addr_map = \%slot_of;
    return $pci_addr_map;
}
267
# Builds the UUID-shaped name of the mediated device for a VM's hostpci slot:
# the slot index zero-padded to 8 digits, three fixed '0000' groups and the
# VMID zero-padded to 12 digits.
sub generate_mdev_uuid {
    my ($vmid, $index) = @_;

    my $index_part = sprintf("%08d", $index);
    my $vmid_part = sprintf("%012d", $vmid);

    return join('-', $index_part, ('0000') x 3, $vmid_part);
}
272
d7d698f6
TL
# Looks up $id in the given address map.
#
# Returns a fresh hash with the entry's 'bus' unchanged and its 'addr'
# formatted as a hexadecimal string ("0x.."), or nothing when the id is
# unknown or the entry lacks either field.
my $get_addr_mapping_from_id = sub {
    my ($map, $id) = @_;

    my $entry = $map->{$id};
    return if !defined($entry);

    for my $field (qw(bus addr)) {
        return if !defined($entry->{$field});
    }

    my $hex_addr = sprintf("0x%x", $entry->{addr});
    return { bus => $entry->{bus}, addr => $hex_addr };
};
281
# Returns the ",bus=...,addr=..." device-string suffix for the device $id, or
# an empty string if the id has no entry in the PCI address map.
#
# $bridges - optional hash; when given, the bus number used by the device is
#            recorded in it as a key.
# $arch, $machine - optional; for arch 'aarch64' with a machine type starting
#            with 'virt' the bus name prefix is 'pcie' instead of 'pci', and
#            ids starting with 'ide' are rejected with an exception.
sub print_pci_addr {
    my ($id, $bridges, $arch, $machine) = @_;

    # using same bus slots on all HW, so we need to check special cases here:
    my $busname = 'pci';
    # both parameters are trailing and may be omitted, so do not compare undef
    my $is_aarch64_virt = defined($arch) && $arch eq 'aarch64'
        && defined($machine) && $machine =~ /^virt/;
    if ($is_aarch64_virt) {
        die "aarch64/virt cannot use IDE devices\n" if $id =~ /^ide/;
        $busname = 'pcie';
    }

    my $d = $get_addr_mapping_from_id->(get_pci_addr_map(), $id);
    return '' if !$d;

    $bridges->{$d->{bus}} = 1 if $bridges;

    return ",bus=$busname.$d->{bus},addr=$d->{addr}";
}
302
d7d698f6
TL
# Cache for get_pcie_addr_map(); built on first use.
my $pcie_addr_map;

# Returns the table of fixed PCIe slot assignments, keyed by device id.
#
# Every value is a hash with the 'bus' name and the numeric 'addr'. Each
# hostpciN has two entries: 'hostpciN' sits at address 0 of its own root port
# 'ich9-pcie-port-<N+1>', while 'hostpciNbus0' sits directly on 'pcie.0'.
#
# The table is created once and the same hash reference is returned on every
# call.
sub get_pcie_addr_map {
    return $pcie_addr_map if defined($pcie_addr_map);

    my %slot_of = (
        vga => { bus => 'pcie.0', addr => 1 },
        ivshmem => { bus => 'pcie.0', addr => 20 },
    );

    for my $i (0 .. 15) {
        $slot_of{"hostpci$i"} = { bus => 'ich9-pcie-port-' . ($i + 1), addr => 0 };

        # win7 is picky about pcie assignments
        # hostpci0-3 use addr 16-19, hostpci4-10 addr 9-15, hostpci11-15 addr 21-25
        my $offset = $i <= 3  ? 16
                   : $i <= 10 ? 5
                   :            10;
        $slot_of{"hostpci${i}bus0"} = { bus => 'pcie.0', addr => $i + $offset };
    }

    $pcie_addr_map = \%slot_of;
    return $pcie_addr_map;
}
345
# Returns the ",bus=...,addr=..." device-string suffix for the device $id, or
# an empty string if the id has no entry in the PCIe address map.
sub print_pcie_addr {
    my ($id) = @_;

    my $slot = $get_addr_mapping_from_id->(get_pcie_addr_map(), $id);

    return $slot ? ",bus=$slot->{bus},addr=$slot->{addr}" : '';
}
b71351a7 358
c4e16381
AL
# Generates the device string for an additional pcie root port. The first 4
# pcie root ports are defined in the pve-q35*.cfg files, so only the indexes
# 4 to 15 yield a device string; any other index yields an empty string.
sub print_pcie_root_port {
    my ($i) = @_;

    # index 4..11 -> functions 0..7 of slot 10, index 12..15 -> functions 0..3 of slot 11
    my %addr_of = (
        (map { $_ => '10.' . ($_ - 4) } 4 .. 11),
        (map { $_ => '11.' . ($_ - 12) } 12 .. 15),
    );

    my $addr = $addr_of{$i};
    return '' if !defined($addr);

    # ports are numbered starting at 1
    my $port = $i + 1;

    return join(',',
        "pcie-root-port",
        "id=ich9-pcie-port-${port}",
        "addr=$addr",
        "x-speed=16",
        "x-width=32",
        "multifunction=on",
        "bus=pcie.0",
        "port=${port}",
        "chassis=${port}",
    );
}
390
9b71c34d
DC
# Parses one 'hostpciN' config value.
#
# Returns nothing for an empty value. Otherwise returns the parsed property
# string with the 'host' and 'mapping' keys removed and an 'ids' key added,
# which holds a list of lists:
#
# {
#     rombar => ...,
#     ...
#     ids => [
#         # one entry per alternative device,
#         [
#             # each being the list of lspci results making up that device
#             { id => "0000:00:00.0", ... },
#             { id => "0000:00:00.1", ... },
#         ],
#         [
#             ...
#         ],
#         ...
#     ],
# }
#
# A 'host' value gives exactly one alternative; a 'mapping' gives one
# alternative per device found for it on the current node. If any alternative
# consists of more than one entry, 'has-multifunction' is set in the result.
#
# Dies if both or neither of 'host' and 'mapping' are set, if the mapping is
# unknown or invalid, if an id matches no PCI device, or if 'mdev' is combined
# with a multifunction device.
sub parse_hostpci {
    my ($value) = @_;

    return if !$value;

    my $res = PVE::JSONSchema::parse_property_string($hostpci_fmt, $value);

    my $host = delete $res->{host};
    my $mapping = delete $res->{mapping};

    die "Cannot set both 'host' and 'mapping'.\n" if defined($host) && defined($mapping);

    # every alternative is a list of PCI id strings
    my @alternatives;
    if ($mapping) {
        # we have no ordinary pci id, must be a mapping
        my $mapped = PVE::Mapping::PCI::find_on_current_node($mapping);
        die "PCI device mapping not found for '$mapping'\n" if !$mapped || !scalar($mapped->@*);

        for my $entry ($mapped->@*) {
            eval { PVE::Mapping::PCI::assert_valid($mapping, $entry) };
            die "PCI device mapping invalid (hardware probably changed): $@\n" if $@;
            push @alternatives, [split(/;/, $entry->{path})];
        }
    } elsif ($host) {
        push @alternatives, [split(/;/, $host)];
    } else {
        die "Either 'host' or 'mapping' must be set.\n";
    }

    $res->{ids} = [];
    for my $pci_ids (@alternatives) {
        my @functions;
        for my $pci_id ($pci_ids->@*) {
            my $found = PVE::SysFSTools::lspci($pci_id);
            die "no PCI device found for '$pci_id'\n" if !scalar($found->@*);
            push @functions, $found->@*;
        }

        if (scalar(@functions) > 1) {
            $res->{'has-multifunction'} = 1;
            die "cannot use mediated device with multifunction device\n" if $res->{mdev};
        }

        push $res->{ids}->@*, \@functions;
    }

    return $res;
}
463
9b71c34d
DC
# Parses all hostpci devices from a VM config and does some sanity checks.
#
# Returns a hash with one entry per configured device:
# {
#     hostpci0 => {
#         # hash from parse_hostpci function
#     },
#     hostpci1 => { ... },
#     ...
# }
#
# Dies if a device requests 'pcie' on a non-q35 machine, or if a 'legacy-igd'
# device violates one of the restrictions checked below.
sub parse_hostpci_devices {
    my ($conf) = @_;

    my $q35 = PVE::QemuServer::Machine::machine_type_is_q35($conf);
    my $legacy_igd = 0;

    my $parsed_devices = {};
    for my $i (0 .. $MAX_HOSTPCI_DEVICES - 1) {
        my $id = "hostpci$i";
        my $d = parse_hostpci($conf->{$id});
        next if !$d;

        # check syntax
        # trailing newline keeps perl from appending " at FILE line N." to the
        # message, like for every other error raised here
        die "q35 machine model is not enabled\n" if !$q35 && $d->{pcie};

        if ($d->{'legacy-igd'}) {
            die "only one device can be assigned in legacy-igd mode\n"
                if $legacy_igd;
            $legacy_igd = 1;

            die "legacy IGD assignment requires VGA mode to be 'none'\n"
                if !defined($conf->{'vga'}) || $conf->{'vga'} ne 'none';
            die "legacy IGD assignment requires rombar to be enabled\n"
                if defined($d->{rombar}) && !$d->{rombar};
            die "legacy IGD assignment is not compatible with x-vga\n"
                if $d->{'x-vga'};
            die "legacy IGD assignment is not compatible with mdev\n"
                if $d->{mdev};
            die "legacy IGD assignment is not compatible with q35\n"
                if $q35;
            die "legacy IGD assignment is not compatible with multifunction devices\n"
                if $d->{'has-multifunction'};
            die "legacy IGD assignment is not compatible with alternate devices\n"
                if scalar($d->{ids}->@*) > 1;
            # check first device for valid id
            die "legacy IGD assignment only works for devices on host bus 00:02.0\n"
                if $d->{ids}->[0]->[0]->{id} !~ m/02\.0$/;
        }

        $parsed_devices->{$id} = $d;
    }

    return $parsed_devices;
}
517
# Takes the hash returned by parse_hostpci_devices and, for every non-mdev
# device, selects one of the given alternatives by trying to reserve it.
#
# mdev devices must be chosen later when we actually allocate them, but the
# inner list is flattened since there can only be one device per alternative
# anyway.
#
# The passed hash is modified in place and returned: afterwards 'ids' of each
# device is a flat list of device entries instead of a list of alternatives.
#
# Dies if a device id ends up assigned more than once, or if none of the
# alternatives of a device could be reserved.
my sub choose_hostpci_devices {
    my ($devices, $vmid) = @_;

    # PCI ids already handed out to an earlier hostpci entry
    my $used = {};

    my $add_used_device = sub {
        my ($chosen) = @_;
        for my $used_device ($chosen->@*) {
            my $used_id = $used_device->{id};
            die "device '$used_id' assigned more than once\n" if $used->{$used_id};
            $used->{$used_id} = 1;
        }
    };

    for my $i (0 .. $MAX_HOSTPCI_DEVICES - 1) {
        my $device = $devices->{"hostpci$i"};
        next if !$device;

        if ($device->{mdev}) {
            $device->{ids} = [ map { $_->[0] } $device->{ids}->@* ];
            next;
        }

        # count the alternatives first, then compare
        if (scalar($device->{ids}->@*) == 1) {
            # we only have one alternative, use that
            $device->{ids} = $device->{ids}->[0];
            $add_used_device->($device->{ids});
            next;
        }

        my $found = 0;
        for my $alternative ($device->{ids}->@*) {
            my $ids = [ map { $_->{id} } @$alternative ];

            next if grep { defined($used->{$_}) } @$ids; # already used
            eval { reserve_pci_usage($ids, $vmid, 10, undef) };
            next if $@;

            # found one that is not used or reserved
            $add_used_device->($alternative);
            $device->{ids} = $alternative;
            $found = 1;
            last;
        }
        die "could not find a free device for 'hostpci$i'\n" if !$found;
    }

    return $devices;
}
572
# Appends the '-device vfio-pci,...' arguments for all hostpci devices of the
# VM config $conf to the list $devices (plus the extra pcie root ports needed
# for pcie devices with an index above 3).
#
# Side effects besides extending $devices: sets $vga->{type} to 'none' when an
# x-vga device is present and the config has no 'vga' entry, and passes
# $bridges on to print_pci_addr for non-pcie devices.
#
# Returns the list ($kvm_off, $gpu_passthrough, $legacy_igd, $pci_devices),
# where the first two flags are set if any device has x-vga enabled, the third
# if any device uses legacy-igd mode, and the last is the hash of chosen
# devices.
sub print_hostpci_devices {
    my ($vmid, $conf, $devices, $vga, $winversion, $bridges, $arch, $machine_type, $bootorder) = @_;

    my ($kvm_off, $gpu_passthrough, $legacy_igd) = (0, 0, 0);

    my $pci_devices = choose_hostpci_devices(parse_hostpci_devices($conf), $vmid);

    for my $i (0 .. $MAX_HOSTPCI_DEVICES - 1) {
        my $id = "hostpci$i";
        my $d = $pci_devices->{$id};
        next if !$d;

        $legacy_igd = 1 if $d->{'legacy-igd'};

        my $pciaddr;
        if (!$d->{pcie}) {
            my $pci_name = $d->{'legacy-igd'} ? 'legacy-igd' : $id;
            $pciaddr = print_pci_addr($pci_name, $bridges, $arch, $machine_type);
        } elsif ($winversion == 7) {
            # win7 wants to have the pcie devices directly on the pcie bus
            # instead of in the root port
            $pciaddr = print_pcie_addr("${id}bus0");
        } else {
            # add more root ports if needed, 4 are present by default
            # by pve-q35 cfgs, rest added here on demand.
            push @$devices, '-device', print_pcie_root_port($i) if $i > 3;
            $pciaddr = print_pcie_addr($id);
        }

        my $num_devices = scalar($d->{ids}->@*);
        my $multifunction = $num_devices > 1 && !$d->{mdev};

        my $xvga = '';
        if ($d->{'x-vga'}) {
            my $uses_ovmf = $conf->{bios} && $conf->{bios} eq 'ovmf';
            $xvga = ',x-vga=on' if !$uses_ovmf;
            $kvm_off = 1;
            $gpu_passthrough = 1;
            $vga->{type} = 'none' if !defined($conf->{vga});
        }

        # mediated devices are addressed via their sysfs path, not a host id
        my $sysfspath;
        $sysfspath = "/sys/bus/mdev/devices/" . generate_mdev_uuid($vmid, $i) if $d->{mdev};

        for my $j (0 .. $num_devices - 1) {
            my $source = $sysfspath ? "sysfsdev=$sysfspath" : "host=$d->{ids}->[$j]->{id}";

            # multifunction devices get the function number appended to both id and address
            my $mf_addr = $multifunction ? ".$j" : '';
            my $devicestr = "vfio-pci,$source,id=${id}${mf_addr}${pciaddr}${mf_addr}";

            # the remaining options are only set on the first function
            if ($j == 0) {
                $devicestr .= ',rombar=0' if defined($d->{rombar}) && !$d->{rombar};
                $devicestr .= $xvga;
                $devicestr .= ",multifunction=on" if $multifunction;
                $devicestr .= ",romfile=/usr/share/kvm/$d->{romfile}" if $d->{romfile};
                $devicestr .= ",bootindex=$bootorder->{$id}" if $bootorder->{$id};
                for my $option (qw(vendor-id device-id sub-vendor-id sub-device-id)) {
                    $devicestr .= ",x-pci-$option=$d->{$option}" if $d->{$option};
                }
            }

            push @$devices, '-device', $devicestr;

            # a mediated device is only ever added once
            last if $d->{mdev};
        }
    }

    return ($kvm_off, $gpu_passthrough, $legacy_igd, $pci_devices);
}
657
# Prepares the host PCI device $pciid for pass-through to the VM $vmid.
#
# With $mdev set, a mediated device of that type is created on the device,
# named by generate_mdev_uuid($vmid, $index). Otherwise the device's group is
# bound to VFIO and the device is reset if its info has 'has_fl_reset' set.
#
# Returns the device info from PVE::SysFSTools::pci_device_info. Dies if IOMMU
# support is missing, no device info is available, or binding/resetting fails.
sub prepare_pci_device {
    my ($vmid, $pciid, $index, $mdev) = @_;

    my $info = PVE::SysFSTools::pci_device_info("$pciid");
    PVE::SysFSTools::check_iommu_support()
        or die "cannot prepare PCI pass-through, IOMMU not present\n";
    $info or die "no pci device info for device '$pciid'\n";

    if ($mdev) {
        PVE::SysFSTools::pci_create_mdev_device($pciid, generate_mdev_uuid($vmid, $index), $mdev);
        return $info;
    }

    PVE::SysFSTools::pci_dev_group_bind_to_vfio($pciid)
        or die "can't unbind/bind PCI group to VFIO '$pciid'\n";

    if ($info->{has_fl_reset}) {
        PVE::SysFSTools::pci_dev_reset($info)
            or die "can't reset PCI device '$pciid'\n";
    }

    return $info;
}
677
bda0ebff
TL
my $RUNDIR = '/run/qemu-server';
my $PCIID_RESERVATION_FILE = "${RUNDIR}/pci-id-reservations";
my $PCIID_RESERVATION_LOCK = "${PCIID_RESERVATION_FILE}.lock";

# The reservation file is a list of PCI ID to VMID reservations. The validity
# of each one is protected against leakage by either a PID, for successfully
# started VM processes, or an expiration time for the initial time window
# between reservation and actual VM process start-up.
#
# Reads the reservation file without taking the lock; callers are expected to
# hold $PCIID_RESERVATION_LOCK.
#
# Returns a hash keyed by PCI id, each value holding the 'vmid' and either a
# 'pid' or a 'time' entry. Lines that do not match the expected format are
# skipped; a file that cannot be opened yields an empty hash.
my $parse_pci_reservation_unlocked = sub {
    my $pciids = {};

    # builtin open, so this does not depend on IO::File having been loaded
    # by some other module
    open(my $fh, '<', $PCIID_RESERVATION_FILE) or return $pciids;

    while (my $line = <$fh>) {
        if ($line =~ m/^($PCIRE)\s(\d+)\s(time|pid)\:(\d+)$/) {
            $pciids->{$1} = {
                vmid => $2,
                "$3" => $4,
            };
        }
    }
    close($fh);

    return $pciids;
};
699
# Writes the given reservations (same structure as returned by the parser) to
# the reservation file without taking the lock, one line per PCI id in sorted
# order: "<pci-id> <vmid> pid:<pid>" if the entry has a pid, otherwise
# "<pci-id> <vmid> time:<time>".
my $write_pci_reservation_unlocked = sub {
    my ($reservations) = @_;

    my @lines;
    for my $pci_id (sort keys $reservations->%*) {
        my $entry = $reservations->{$pci_id};
        my $holder = defined($entry->{pid}) ? "pid:$entry->{pid}" : "time:$entry->{time}";
        push @lines, "$pci_id $entry->{vmid} $holder\n";
    }

    PVE::Tools::file_set_contents($PCIID_RESERVATION_FILE, join('', @lines));
};
714
# Removes all PCI device reservations held by the `vmid`.
#
# Reads, filters and rewrites the reservation file while holding the
# reservation lock (lock timeout 2); re-raises any error from the locked
# section.
sub remove_pci_reservation {
    my ($vmid) = @_;

    PVE::Tools::lock_file($PCIID_RESERVATION_LOCK, 2, sub {
        my $reservations = $parse_pci_reservation_unlocked->();

        my @owned_ids = grep { $reservations->{$_}->{vmid} == $vmid } keys %$reservations;
        delete @{$reservations}{@owned_ids};

        $write_pci_reservation_unlocked->($reservations);
    });
    die $@ if $@;
}
730
# Reserves the given PCI ids for the VM $vmid.
#
# $requested_ids - a single PCI id or a list of them; an empty list is a no-op
# $timeout       - if set (and no $pid), reserve until now + $timeout + 5
# $pid           - if set, reserve for that PID; takes precedence over $timeout
#
# Works on the reservation file while holding the reservation lock (lock
# timeout 5). Dies if an id is held by another VMID through a time
# reservation that has not expired yet or through the PID of its running VM,
# or if the same VMID already holds it with the PID of its running VM and a
# $pid was passed. Errors from the locked section are re-raised.
sub reserve_pci_usage {
    my ($requested_ids, $vmid, $timeout, $pid) = @_;

    $requested_ids = [ $requested_ids ] if !ref($requested_ids);
    return if !scalar(@$requested_ids); # do nothing for empty list

    PVE::Tools::lock_file($PCIID_RESERVATION_LOCK, 5, sub {
        my $reservations = $parse_pci_reservation_unlocked->();
        my $now = time();

        for my $id ($requested_ids->@*) {
            my $existing = $reservations->{$id};

            if ($existing && $existing->{vmid} != $vmid) {
                # held by another guest: check time based reservation
                die "PCI device '$id' is currently reserved for use by VMID '$existing->{vmid}'\n"
                    if defined($existing->{time}) && $existing->{time} > $now;

                if (my $reserved_pid = $existing->{pid}) {
                    # check running vm
                    my $running_pid = PVE::QemuServer::Helpers::vm_running_locally($existing->{vmid});
                    if (defined($running_pid) && $running_pid == $reserved_pid) {
                        die "PCI device '$id' already in use by VMID '$existing->{vmid}'\n";
                    } else {
                        warn "leftover PCI reservation found for $id, lets take it...\n";
                    }
                }
            } elsif ($existing) {
                # already reserved by the same vmid
                if ($existing->{time}) {
                    if (defined($timeout)) {
                        # use the longer timeout; note that a prolonged timeout
                        # also applies to the ids handled after this one
                        my $old_timeout = $existing->{time} - 5 - $now;
                        $timeout = $old_timeout if $old_timeout > $timeout;
                    }
                } elsif ($existing->{pid}) {
                    my $running_pid = PVE::QemuServer::Helpers::vm_running_locally($existing->{vmid});
                    if (defined($running_pid) && $running_pid == $existing->{pid}) {
                        if (defined($pid)) {
                            die "PCI device '$id' already in use by running VMID '$existing->{vmid}'\n";
                        } elsif (defined($timeout)) {
                            # ignore timeout reservation for running vms, can happen with e.g.
                            # qm showcmd
                            # (this leaves the locked section without writing anything)
                            return;
                        }
                    }
                }
            }

            my $entry = { vmid => $vmid };
            if (defined($pid)) { # VM started up, we can reserve now with the actual PID
                $entry->{pid} = $pid;
            } elsif (defined($timeout)) { # temporary reserve as we don't know the PID yet
                $entry->{time} = $now + $timeout + 5;
            }
            $reservations->{$id} = $entry;
        }

        $write_pci_reservation_unlocked->($reservations);
    });
    die $@ if $@;
}
790
b71351a7 7911;