]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blob - drivers/pci/host/pci-hyperv.c
c2de1ea95ef492788b0aa8939650f5e3a6c2c2c9
[mirror_ubuntu-bionic-kernel.git] / drivers / pci / host / pci-hyperv.c
1 /*
2 * Copyright (c) Microsoft Corporation.
3 *
4 * Author:
5 * Jake Oshins <jakeo@microsoft.com>
6 *
7 * This driver acts as a paravirtual front-end for PCI Express root buses.
8 * When a PCI Express function (either an entire device or an SR-IOV
9 * Virtual Function) is being passed through to the VM, this driver exposes
10 * a new bus to the guest VM. This is modeled as a root PCI bus because
11 * no bridges are being exposed to the VM. In fact, with a "Generation 2"
12 * VM within Hyper-V, there may seem to be no PCI bus at all in the VM
13 * until a device as been exposed using this driver.
14 *
15 * Each root PCI bus has its own PCI domain, which is called "Segment" in
16 * the PCI Firmware Specifications. Thus while each device passed through
17 * to the VM using this front-end will appear at "device 0", the domain will
18 * be unique. Typically, each bus will have one PCI function on it, though
19 * this driver does support more than one.
20 *
21 * In order to map the interrupts from the device through to the guest VM,
22 * this driver also implements an IRQ Domain, which handles interrupts (either
23 * MSI or MSI-X) associated with the functions on the bus. As interrupts are
24 * set up, torn down, or reaffined, this driver communicates with the
25 * underlying hypervisor to adjust the mappings in the I/O MMU so that each
26 * interrupt will be delivered to the correct virtual processor at the right
27 * vector. This driver does not support level-triggered (line-based)
28 * interrupts, and will report that the Interrupt Line register in the
29 * function's configuration space is zero.
30 *
31 * The rest of this driver mostly maps PCI concepts onto underlying Hyper-V
32 * facilities. For instance, the configuration space of a function exposed
33 * by Hyper-V is mapped into a single page of memory space, and the
34 * read and write handlers for config space must be aware of this mechanism.
35 * Similarly, device setup and teardown involves messages sent to and from
36 * the PCI back-end driver in Hyper-V.
37 *
38 * This program is free software; you can redistribute it and/or modify it
39 * under the terms of the GNU General Public License version 2 as published
40 * by the Free Software Foundation.
41 *
42 * This program is distributed in the hope that it will be useful, but
43 * WITHOUT ANY WARRANTY; without even the implied warranty of
44 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
45 * NON INFRINGEMENT. See the GNU General Public License for more
46 * details.
47 *
48 */
49
50 #include <linux/kernel.h>
51 #include <linux/module.h>
52 #include <linux/pci.h>
53 #include <linux/delay.h>
54 #include <linux/semaphore.h>
55 #include <linux/irqdomain.h>
56 #include <asm/irqdomain.h>
57 #include <asm/apic.h>
58 #include <linux/irq.h>
59 #include <linux/msi.h>
60 #include <linux/hyperv.h>
61 #include <linux/refcount.h>
62 #include <asm/mshyperv.h>
63
64 /*
65 * Protocol versions. The low word is the minor version, the high word the
66 * major version.
67 */
68
69 #define PCI_MAKE_VERSION(major, minor) ((u32)(((major) << 16) | (minor)))
70 #define PCI_MAJOR_VERSION(version) ((u32)(version) >> 16)
71 #define PCI_MINOR_VERSION(version) ((u32)(version) & 0xff)
72
73 enum pci_protocol_version_t {
74 PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1), /* Win10 */
75 PCI_PROTOCOL_VERSION_1_2 = PCI_MAKE_VERSION(1, 2), /* RS1 */
76 };
77
78 #define CPU_AFFINITY_ALL -1ULL
79
80 /*
81 * Supported protocol versions in the order of probing - highest go
82 * first.
83 */
84 static enum pci_protocol_version_t pci_protocol_versions[] = {
85 PCI_PROTOCOL_VERSION_1_2,
86 PCI_PROTOCOL_VERSION_1_1,
87 };
88
89 /*
90 * Protocol version negotiated by hv_pci_protocol_negotiation().
91 */
92 static enum pci_protocol_version_t pci_protocol_version;
93
94 #define PCI_CONFIG_MMIO_LENGTH 0x2000
95 #define CFG_PAGE_OFFSET 0x1000
96 #define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET)
97
98 #define MAX_SUPPORTED_MSI_MESSAGES 0x400
99
100 #define STATUS_REVISION_MISMATCH 0xC0000059
101
102 /*
103 * Message Types
104 */
105
106 enum pci_message_type {
107 /*
108 * Version 1.1
109 */
110 PCI_MESSAGE_BASE = 0x42490000,
111 PCI_BUS_RELATIONS = PCI_MESSAGE_BASE + 0,
112 PCI_QUERY_BUS_RELATIONS = PCI_MESSAGE_BASE + 1,
113 PCI_POWER_STATE_CHANGE = PCI_MESSAGE_BASE + 4,
114 PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5,
115 PCI_QUERY_RESOURCE_RESOURCES = PCI_MESSAGE_BASE + 6,
116 PCI_BUS_D0ENTRY = PCI_MESSAGE_BASE + 7,
117 PCI_BUS_D0EXIT = PCI_MESSAGE_BASE + 8,
118 PCI_READ_BLOCK = PCI_MESSAGE_BASE + 9,
119 PCI_WRITE_BLOCK = PCI_MESSAGE_BASE + 0xA,
120 PCI_EJECT = PCI_MESSAGE_BASE + 0xB,
121 PCI_QUERY_STOP = PCI_MESSAGE_BASE + 0xC,
122 PCI_REENABLE = PCI_MESSAGE_BASE + 0xD,
123 PCI_QUERY_STOP_FAILED = PCI_MESSAGE_BASE + 0xE,
124 PCI_EJECTION_COMPLETE = PCI_MESSAGE_BASE + 0xF,
125 PCI_RESOURCES_ASSIGNED = PCI_MESSAGE_BASE + 0x10,
126 PCI_RESOURCES_RELEASED = PCI_MESSAGE_BASE + 0x11,
127 PCI_INVALIDATE_BLOCK = PCI_MESSAGE_BASE + 0x12,
128 PCI_QUERY_PROTOCOL_VERSION = PCI_MESSAGE_BASE + 0x13,
129 PCI_CREATE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x14,
130 PCI_DELETE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x15,
131 PCI_RESOURCES_ASSIGNED2 = PCI_MESSAGE_BASE + 0x16,
132 PCI_CREATE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x17,
133 PCI_DELETE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x18, /* unused */
134 PCI_MESSAGE_MAXIMUM
135 };
136
137 /*
138 * Structures defining the virtual PCI Express protocol.
139 */
140
141 union pci_version {
142 struct {
143 u16 minor_version;
144 u16 major_version;
145 } parts;
146 u32 version;
147 } __packed;
148
149 /*
150 * Function numbers are 8-bits wide on Express, as interpreted through ARI,
151 * which is all this driver does. This representation is the one used in
152 * Windows, which is what is expected when sending this back and forth with
153 * the Hyper-V parent partition.
154 */
155 union win_slot_encoding {
156 struct {
157 u32 dev:5;
158 u32 func:3;
159 u32 reserved:24;
160 } bits;
161 u32 slot;
162 } __packed;
163
164 /*
165 * Pretty much as defined in the PCI Specifications.
166 */
167 struct pci_function_description {
168 u16 v_id; /* vendor ID */
169 u16 d_id; /* device ID */
170 u8 rev;
171 u8 prog_intf;
172 u8 subclass;
173 u8 base_class;
174 u32 subsystem_id;
175 union win_slot_encoding win_slot;
176 u32 ser; /* serial number */
177 } __packed;
178
179 /**
180 * struct hv_msi_desc
181 * @vector: IDT entry
182 * @delivery_mode: As defined in Intel's Programmer's
183 * Reference Manual, Volume 3, Chapter 8.
184 * @vector_count: Number of contiguous entries in the
185 * Interrupt Descriptor Table that are
186 * occupied by this Message-Signaled
187 * Interrupt. For "MSI", as first defined
188 * in PCI 2.2, this can be between 1 and
189 * 32. For "MSI-X," as first defined in PCI
190 * 3.0, this must be 1, as each MSI-X table
191 * entry would have its own descriptor.
192 * @reserved: Empty space
193 * @cpu_mask: All the target virtual processors.
194 */
195 struct hv_msi_desc {
196 u8 vector;
197 u8 delivery_mode;
198 u16 vector_count;
199 u32 reserved;
200 u64 cpu_mask;
201 } __packed;
202
203 /**
204 * struct hv_msi_desc2 - 1.2 version of hv_msi_desc
205 * @vector: IDT entry
206 * @delivery_mode: As defined in Intel's Programmer's
207 * Reference Manual, Volume 3, Chapter 8.
208 * @vector_count: Number of contiguous entries in the
209 * Interrupt Descriptor Table that are
210 * occupied by this Message-Signaled
211 * Interrupt. For "MSI", as first defined
212 * in PCI 2.2, this can be between 1 and
213 * 32. For "MSI-X," as first defined in PCI
214 * 3.0, this must be 1, as each MSI-X table
215 * entry would have its own descriptor.
216 * @processor_count: number of bits enabled in array.
217 * @processor_array: All the target virtual processors.
218 */
219 struct hv_msi_desc2 {
220 u8 vector;
221 u8 delivery_mode;
222 u16 vector_count;
223 u16 processor_count;
224 u16 processor_array[32];
225 } __packed;
226
227 /**
228 * struct tran_int_desc
229 * @reserved: unused, padding
230 * @vector_count: same as in hv_msi_desc
231 * @data: This is the "data payload" value that is
232 * written by the device when it generates
233 * a message-signaled interrupt, either MSI
234 * or MSI-X.
235 * @address: This is the address to which the data
236 * payload is written on interrupt
237 * generation.
238 */
239 struct tran_int_desc {
240 u16 reserved;
241 u16 vector_count;
242 u32 data;
243 u64 address;
244 } __packed;
245
246 /*
247 * A generic message format for virtual PCI.
248 * Specific message formats are defined later in the file.
249 */
250
251 struct pci_message {
252 u32 type;
253 } __packed;
254
255 struct pci_child_message {
256 struct pci_message message_type;
257 union win_slot_encoding wslot;
258 } __packed;
259
260 struct pci_incoming_message {
261 struct vmpacket_descriptor hdr;
262 struct pci_message message_type;
263 } __packed;
264
265 struct pci_response {
266 struct vmpacket_descriptor hdr;
267 s32 status; /* negative values are failures */
268 } __packed;
269
270 struct pci_packet {
271 void (*completion_func)(void *context, struct pci_response *resp,
272 int resp_packet_size);
273 void *compl_ctxt;
274
275 struct pci_message message[0];
276 };
277
278 /*
279 * Specific message types supporting the PCI protocol.
280 */
281
282 /*
283 * Version negotiation message. Sent from the guest to the host.
284 * The guest is free to try different versions until the host
285 * accepts the version.
286 *
287 * pci_version: The protocol version requested.
288 * is_last_attempt: If TRUE, this is the last version guest will request.
289 * reservedz: Reserved field, set to zero.
290 */
291
292 struct pci_version_request {
293 struct pci_message message_type;
294 u32 protocol_version;
295 } __packed;
296
297 /*
298 * Bus D0 Entry. This is sent from the guest to the host when the virtual
299 * bus (PCI Express port) is ready for action.
300 */
301
302 struct pci_bus_d0_entry {
303 struct pci_message message_type;
304 u32 reserved;
305 u64 mmio_base;
306 } __packed;
307
308 struct pci_bus_relations {
309 struct pci_incoming_message incoming;
310 u32 device_count;
311 struct pci_function_description func[0];
312 } __packed;
313
314 struct pci_q_res_req_response {
315 struct vmpacket_descriptor hdr;
316 s32 status; /* negative values are failures */
317 u32 probed_bar[6];
318 } __packed;
319
320 struct pci_set_power {
321 struct pci_message message_type;
322 union win_slot_encoding wslot;
323 u32 power_state; /* In Windows terms */
324 u32 reserved;
325 } __packed;
326
327 struct pci_set_power_response {
328 struct vmpacket_descriptor hdr;
329 s32 status; /* negative values are failures */
330 union win_slot_encoding wslot;
331 u32 resultant_state; /* In Windows terms */
332 u32 reserved;
333 } __packed;
334
335 struct pci_resources_assigned {
336 struct pci_message message_type;
337 union win_slot_encoding wslot;
338 u8 memory_range[0x14][6]; /* not used here */
339 u32 msi_descriptors;
340 u32 reserved[4];
341 } __packed;
342
343 struct pci_resources_assigned2 {
344 struct pci_message message_type;
345 union win_slot_encoding wslot;
346 u8 memory_range[0x14][6]; /* not used here */
347 u32 msi_descriptor_count;
348 u8 reserved[70];
349 } __packed;
350
351 struct pci_create_interrupt {
352 struct pci_message message_type;
353 union win_slot_encoding wslot;
354 struct hv_msi_desc int_desc;
355 } __packed;
356
357 struct pci_create_int_response {
358 struct pci_response response;
359 u32 reserved;
360 struct tran_int_desc int_desc;
361 } __packed;
362
363 struct pci_create_interrupt2 {
364 struct pci_message message_type;
365 union win_slot_encoding wslot;
366 struct hv_msi_desc2 int_desc;
367 } __packed;
368
369 struct pci_delete_interrupt {
370 struct pci_message message_type;
371 union win_slot_encoding wslot;
372 struct tran_int_desc int_desc;
373 } __packed;
374
375 struct pci_dev_incoming {
376 struct pci_incoming_message incoming;
377 union win_slot_encoding wslot;
378 } __packed;
379
380 struct pci_eject_response {
381 struct pci_message message_type;
382 union win_slot_encoding wslot;
383 u32 status;
384 } __packed;
385
386 static int pci_ring_size = (4 * PAGE_SIZE);
387
388 /*
389 * Definitions or interrupt steering hypercall.
390 */
391 #define HV_PARTITION_ID_SELF ((u64)-1)
392 #define HVCALL_RETARGET_INTERRUPT 0x7e
393
394 struct hv_interrupt_entry {
395 u32 source; /* 1 for MSI(-X) */
396 u32 reserved1;
397 u32 address;
398 u32 data;
399 };
400
401 #define HV_VP_SET_BANK_COUNT_MAX 5 /* current implementation limit */
402
403 struct hv_vp_set {
404 u64 format; /* 0 (HvGenericSetSparse4k) */
405 u64 valid_banks;
406 u64 masks[HV_VP_SET_BANK_COUNT_MAX];
407 };
408
409 /*
410 * flags for hv_device_interrupt_target.flags
411 */
412 #define HV_DEVICE_INTERRUPT_TARGET_MULTICAST 1
413 #define HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET 2
414
415 struct hv_device_interrupt_target {
416 u32 vector;
417 u32 flags;
418 union {
419 u64 vp_mask;
420 struct hv_vp_set vp_set;
421 };
422 };
423
424 struct retarget_msi_interrupt {
425 u64 partition_id; /* use "self" */
426 u64 device_id;
427 struct hv_interrupt_entry int_entry;
428 u64 reserved2;
429 struct hv_device_interrupt_target int_target;
430 } __packed;
431
432 /*
433 * Driver specific state.
434 */
435
436 enum hv_pcibus_state {
437 hv_pcibus_init = 0,
438 hv_pcibus_probed,
439 hv_pcibus_installed,
440 hv_pcibus_removed,
441 hv_pcibus_maximum
442 };
443
444 struct hv_pcibus_device {
445 struct pci_sysdata sysdata;
446 enum hv_pcibus_state state;
447 atomic_t remove_lock;
448 struct hv_device *hdev;
449 resource_size_t low_mmio_space;
450 resource_size_t high_mmio_space;
451 struct resource *mem_config;
452 struct resource *low_mmio_res;
453 struct resource *high_mmio_res;
454 struct completion *survey_event;
455 struct completion remove_event;
456 struct pci_bus *pci_bus;
457 spinlock_t config_lock; /* Avoid two threads writing index page */
458 spinlock_t device_list_lock; /* Protect lists below */
459 void __iomem *cfg_addr;
460
461 struct list_head resources_for_children;
462
463 struct list_head children;
464 struct list_head dr_list;
465
466 struct msi_domain_info msi_info;
467 struct msi_controller msi_chip;
468 struct irq_domain *irq_domain;
469
470 /* hypercall arg, must not cross page boundary */
471 struct retarget_msi_interrupt retarget_msi_interrupt_params;
472
473 spinlock_t retarget_msi_interrupt_lock;
474
475 struct workqueue_struct *wq;
476 };
477
478 /*
479 * Tracks "Device Relations" messages from the host, which must be both
480 * processed in order and deferred so that they don't run in the context
481 * of the incoming packet callback.
482 */
483 struct hv_dr_work {
484 struct work_struct wrk;
485 struct hv_pcibus_device *bus;
486 };
487
488 struct hv_dr_state {
489 struct list_head list_entry;
490 u32 device_count;
491 struct pci_function_description func[0];
492 };
493
494 enum hv_pcichild_state {
495 hv_pcichild_init = 0,
496 hv_pcichild_requirements,
497 hv_pcichild_resourced,
498 hv_pcichild_ejecting,
499 hv_pcichild_maximum
500 };
501
502 enum hv_pcidev_ref_reason {
503 hv_pcidev_ref_invalid = 0,
504 hv_pcidev_ref_initial,
505 hv_pcidev_ref_by_slot,
506 hv_pcidev_ref_packet,
507 hv_pcidev_ref_pnp,
508 hv_pcidev_ref_childlist,
509 hv_pcidev_irqdata,
510 hv_pcidev_ref_max
511 };
512
513 struct hv_pci_dev {
514 /* List protected by pci_rescan_remove_lock */
515 struct list_head list_entry;
516 refcount_t refs;
517 enum hv_pcichild_state state;
518 struct pci_function_description desc;
519 bool reported_missing;
520 struct hv_pcibus_device *hbus;
521 struct work_struct wrk;
522
523 /*
524 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
525 * read it back, for each of the BAR offsets within config space.
526 */
527 u32 probed_bar[6];
528 };
529
530 struct hv_pci_compl {
531 struct completion host_event;
532 s32 completion_status;
533 };
534
535 static void hv_pci_onchannelcallback(void *context);
536
537 /**
538 * hv_pci_generic_compl() - Invoked for a completion packet
539 * @context: Set up by the sender of the packet.
540 * @resp: The response packet
541 * @resp_packet_size: Size in bytes of the packet
542 *
543 * This function is used to trigger an event and report status
544 * for any message for which the completion packet contains a
545 * status and nothing else.
546 */
547 static void hv_pci_generic_compl(void *context, struct pci_response *resp,
548 int resp_packet_size)
549 {
550 struct hv_pci_compl *comp_pkt = context;
551
552 if (resp_packet_size >= offsetofend(struct pci_response, status))
553 comp_pkt->completion_status = resp->status;
554 else
555 comp_pkt->completion_status = -1;
556
557 complete(&comp_pkt->host_event);
558 }
559
560 static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
561 u32 wslot);
562 static void get_pcichild(struct hv_pci_dev *hv_pcidev,
563 enum hv_pcidev_ref_reason reason);
564 static void put_pcichild(struct hv_pci_dev *hv_pcidev,
565 enum hv_pcidev_ref_reason reason);
566
567 static void get_hvpcibus(struct hv_pcibus_device *hv_pcibus);
568 static void put_hvpcibus(struct hv_pcibus_device *hv_pcibus);
569
570 /*
571 * There is no good way to get notified from vmbus_onoffer_rescind(),
572 * so let's use polling here, since this is not a hot path.
573 */
574 static int wait_for_response(struct hv_device *hdev,
575 struct completion *comp)
576 {
577 while (true) {
578 if (hdev->channel->rescind) {
579 dev_warn_once(&hdev->device, "The device is gone.\n");
580 return -ENODEV;
581 }
582
583 if (wait_for_completion_timeout(comp, HZ / 10))
584 break;
585 }
586
587 return 0;
588 }
589
590 /**
591 * devfn_to_wslot() - Convert from Linux PCI slot to Windows
592 * @devfn: The Linux representation of PCI slot
593 *
594 * Windows uses a slightly different representation of PCI slot.
595 *
596 * Return: The Windows representation
597 */
598 static u32 devfn_to_wslot(int devfn)
599 {
600 union win_slot_encoding wslot;
601
602 wslot.slot = 0;
603 wslot.bits.dev = PCI_SLOT(devfn);
604 wslot.bits.func = PCI_FUNC(devfn);
605
606 return wslot.slot;
607 }
608
609 /**
610 * wslot_to_devfn() - Convert from Windows PCI slot to Linux
611 * @wslot: The Windows representation of PCI slot
612 *
613 * Windows uses a slightly different representation of PCI slot.
614 *
615 * Return: The Linux representation
616 */
617 static int wslot_to_devfn(u32 wslot)
618 {
619 union win_slot_encoding slot_no;
620
621 slot_no.slot = wslot;
622 return PCI_DEVFN(slot_no.bits.dev, slot_no.bits.func);
623 }
624
625 /*
626 * PCI Configuration Space for these root PCI buses is implemented as a pair
627 * of pages in memory-mapped I/O space. Writing to the first page chooses
628 * the PCI function being written or read. Once the first page has been
629 * written to, the following page maps in the entire configuration space of
630 * the function.
631 */
632
633 /**
634 * _hv_pcifront_read_config() - Internal PCI config read
635 * @hpdev: The PCI driver's representation of the device
636 * @where: Offset within config space
637 * @size: Size of the transfer
638 * @val: Pointer to the buffer receiving the data
639 */
640 static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
641 int size, u32 *val)
642 {
643 unsigned long flags;
644 void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
645
646 /*
647 * If the attempt is to read the IDs or the ROM BAR, simulate that.
648 */
649 if (where + size <= PCI_COMMAND) {
650 memcpy(val, ((u8 *)&hpdev->desc.v_id) + where, size);
651 } else if (where >= PCI_CLASS_REVISION && where + size <=
652 PCI_CACHE_LINE_SIZE) {
653 memcpy(val, ((u8 *)&hpdev->desc.rev) + where -
654 PCI_CLASS_REVISION, size);
655 } else if (where >= PCI_SUBSYSTEM_VENDOR_ID && where + size <=
656 PCI_ROM_ADDRESS) {
657 memcpy(val, (u8 *)&hpdev->desc.subsystem_id + where -
658 PCI_SUBSYSTEM_VENDOR_ID, size);
659 } else if (where >= PCI_ROM_ADDRESS && where + size <=
660 PCI_CAPABILITY_LIST) {
661 /* ROM BARs are unimplemented */
662 *val = 0;
663 } else if (where >= PCI_INTERRUPT_LINE && where + size <=
664 PCI_INTERRUPT_PIN) {
665 /*
666 * Interrupt Line and Interrupt PIN are hard-wired to zero
667 * because this front-end only supports message-signaled
668 * interrupts.
669 */
670 *val = 0;
671 } else if (where + size <= CFG_PAGE_SIZE) {
672 spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
673 /* Choose the function to be read. (See comment above) */
674 writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
675 /* Make sure the function was chosen before we start reading. */
676 mb();
677 /* Read from that function's config space. */
678 switch (size) {
679 case 1:
680 *val = readb(addr);
681 break;
682 case 2:
683 *val = readw(addr);
684 break;
685 default:
686 *val = readl(addr);
687 break;
688 }
689 /*
690 * Make sure the read was done before we release the spinlock
691 * allowing consecutive reads/writes.
692 */
693 mb();
694 spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
695 } else {
696 dev_err(&hpdev->hbus->hdev->device,
697 "Attempt to read beyond a function's config space.\n");
698 }
699 }
700
701 static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev)
702 {
703 u16 ret;
704 unsigned long flags;
705 void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET +
706 PCI_VENDOR_ID;
707
708 spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
709
710 /* Choose the function to be read. (See comment above) */
711 writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
712 /* Make sure the function was chosen before we start reading. */
713 mb();
714 /* Read from that function's config space. */
715 ret = readw(addr);
716 /*
717 * mb() is not required here, because the spin_unlock_irqrestore()
718 * is a barrier.
719 */
720
721 spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
722
723 return ret;
724 }
725
726 /**
727 * _hv_pcifront_write_config() - Internal PCI config write
728 * @hpdev: The PCI driver's representation of the device
729 * @where: Offset within config space
730 * @size: Size of the transfer
731 * @val: The data being transferred
732 */
733 static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where,
734 int size, u32 val)
735 {
736 unsigned long flags;
737 void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
738
739 if (where >= PCI_SUBSYSTEM_VENDOR_ID &&
740 where + size <= PCI_CAPABILITY_LIST) {
741 /* SSIDs and ROM BARs are read-only */
742 } else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) {
743 spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
744 /* Choose the function to be written. (See comment above) */
745 writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
746 /* Make sure the function was chosen before we start writing. */
747 wmb();
748 /* Write to that function's config space. */
749 switch (size) {
750 case 1:
751 writeb(val, addr);
752 break;
753 case 2:
754 writew(val, addr);
755 break;
756 default:
757 writel(val, addr);
758 break;
759 }
760 /*
761 * Make sure the write was done before we release the spinlock
762 * allowing consecutive reads/writes.
763 */
764 mb();
765 spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
766 } else {
767 dev_err(&hpdev->hbus->hdev->device,
768 "Attempt to write beyond a function's config space.\n");
769 }
770 }
771
772 /**
773 * hv_pcifront_read_config() - Read configuration space
774 * @bus: PCI Bus structure
775 * @devfn: Device/function
776 * @where: Offset from base
777 * @size: Byte/word/dword
778 * @val: Value to be read
779 *
780 * Return: PCIBIOS_SUCCESSFUL on success
781 * PCIBIOS_DEVICE_NOT_FOUND on failure
782 */
783 static int hv_pcifront_read_config(struct pci_bus *bus, unsigned int devfn,
784 int where, int size, u32 *val)
785 {
786 struct hv_pcibus_device *hbus =
787 container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
788 struct hv_pci_dev *hpdev;
789
790 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
791 if (!hpdev)
792 return PCIBIOS_DEVICE_NOT_FOUND;
793
794 _hv_pcifront_read_config(hpdev, where, size, val);
795
796 put_pcichild(hpdev, hv_pcidev_ref_by_slot);
797 return PCIBIOS_SUCCESSFUL;
798 }
799
800 /**
801 * hv_pcifront_write_config() - Write configuration space
802 * @bus: PCI Bus structure
803 * @devfn: Device/function
804 * @where: Offset from base
805 * @size: Byte/word/dword
806 * @val: Value to be written to device
807 *
808 * Return: PCIBIOS_SUCCESSFUL on success
809 * PCIBIOS_DEVICE_NOT_FOUND on failure
810 */
811 static int hv_pcifront_write_config(struct pci_bus *bus, unsigned int devfn,
812 int where, int size, u32 val)
813 {
814 struct hv_pcibus_device *hbus =
815 container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
816 struct hv_pci_dev *hpdev;
817
818 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
819 if (!hpdev)
820 return PCIBIOS_DEVICE_NOT_FOUND;
821
822 _hv_pcifront_write_config(hpdev, where, size, val);
823
824 put_pcichild(hpdev, hv_pcidev_ref_by_slot);
825 return PCIBIOS_SUCCESSFUL;
826 }
827
828 /* PCIe operations */
829 static struct pci_ops hv_pcifront_ops = {
830 .read = hv_pcifront_read_config,
831 .write = hv_pcifront_write_config,
832 };
833
834 /* Interrupt management hooks */
835 static void hv_int_desc_free(struct hv_pci_dev *hpdev,
836 struct tran_int_desc *int_desc)
837 {
838 struct pci_delete_interrupt *int_pkt;
839 struct {
840 struct pci_packet pkt;
841 u8 buffer[sizeof(struct pci_delete_interrupt)];
842 } ctxt;
843
844 memset(&ctxt, 0, sizeof(ctxt));
845 int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
846 int_pkt->message_type.type =
847 PCI_DELETE_INTERRUPT_MESSAGE;
848 int_pkt->wslot.slot = hpdev->desc.win_slot.slot;
849 int_pkt->int_desc = *int_desc;
850 vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, sizeof(*int_pkt),
851 (unsigned long)&ctxt.pkt, VM_PKT_DATA_INBAND, 0);
852 kfree(int_desc);
853 }
854
855 /**
856 * hv_msi_free() - Free the MSI.
857 * @domain: The interrupt domain pointer
858 * @info: Extra MSI-related context
859 * @irq: Identifies the IRQ.
860 *
861 * The Hyper-V parent partition and hypervisor are tracking the
862 * messages that are in use, keeping the interrupt redirection
863 * table up to date. This callback sends a message that frees
864 * the IRT entry and related tracking nonsense.
865 */
866 static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info,
867 unsigned int irq)
868 {
869 struct hv_pcibus_device *hbus;
870 struct hv_pci_dev *hpdev;
871 struct pci_dev *pdev;
872 struct tran_int_desc *int_desc;
873 struct irq_data *irq_data = irq_domain_get_irq_data(domain, irq);
874 struct msi_desc *msi = irq_data_get_msi_desc(irq_data);
875
876 pdev = msi_desc_to_pci_dev(msi);
877 hbus = info->data;
878 int_desc = irq_data_get_irq_chip_data(irq_data);
879 if (!int_desc)
880 return;
881
882 irq_data->chip_data = NULL;
883 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
884 if (!hpdev) {
885 kfree(int_desc);
886 return;
887 }
888
889 hv_int_desc_free(hpdev, int_desc);
890 put_pcichild(hpdev, hv_pcidev_ref_by_slot);
891 }
892
893 static int hv_set_affinity(struct irq_data *data, const struct cpumask *dest,
894 bool force)
895 {
896 struct irq_data *parent = data->parent_data;
897
898 return parent->chip->irq_set_affinity(parent, dest, force);
899 }
900
901 static void hv_irq_mask(struct irq_data *data)
902 {
903 pci_msi_mask_irq(data);
904 }
905
906 /**
907 * hv_irq_unmask() - "Unmask" the IRQ by setting its current
908 * affinity.
909 * @data: Describes the IRQ
910 *
911 * Build new a destination for the MSI and make a hypercall to
912 * update the Interrupt Redirection Table. "Device Logical ID"
913 * is built out of this PCI bus's instance GUID and the function
914 * number of the device.
915 */
916 static void hv_irq_unmask(struct irq_data *data)
917 {
918 struct msi_desc *msi_desc = irq_data_get_msi_desc(data);
919 struct irq_cfg *cfg = irqd_cfg(data);
920 struct retarget_msi_interrupt *params;
921 struct hv_pcibus_device *hbus;
922 struct cpumask *dest;
923 struct pci_bus *pbus;
924 struct pci_dev *pdev;
925 unsigned long flags;
926 u32 var_size = 0;
927 int cpu_vmbus;
928 int cpu;
929 u64 res;
930
931 dest = irq_data_get_effective_affinity_mask(data);
932 pdev = msi_desc_to_pci_dev(msi_desc);
933 pbus = pdev->bus;
934 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
935
936 spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags);
937
938 params = &hbus->retarget_msi_interrupt_params;
939 memset(params, 0, sizeof(*params));
940 params->partition_id = HV_PARTITION_ID_SELF;
941 params->int_entry.source = 1; /* MSI(-X) */
942 params->int_entry.address = msi_desc->msg.address_lo;
943 params->int_entry.data = msi_desc->msg.data;
944 params->device_id = (hbus->hdev->dev_instance.b[5] << 24) |
945 (hbus->hdev->dev_instance.b[4] << 16) |
946 (hbus->hdev->dev_instance.b[7] << 8) |
947 (hbus->hdev->dev_instance.b[6] & 0xf8) |
948 PCI_FUNC(pdev->devfn);
949 params->int_target.vector = cfg->vector;
950
951 /*
952 * Honoring apic->irq_delivery_mode set to dest_Fixed by
953 * setting the HV_DEVICE_INTERRUPT_TARGET_MULTICAST flag results in a
954 * spurious interrupt storm. Not doing so does not seem to have a
955 * negative effect (yet?).
956 */
957
958 if (pci_protocol_version >= PCI_PROTOCOL_VERSION_1_2) {
959 /*
960 * PCI_PROTOCOL_VERSION_1_2 supports the VP_SET version of the
961 * HVCALL_RETARGET_INTERRUPT hypercall, which also coincides
962 * with >64 VP support.
963 * ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED
964 * is not sufficient for this hypercall.
965 */
966 params->int_target.flags |=
967 HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
968 params->int_target.vp_set.valid_banks =
969 (1ull << HV_VP_SET_BANK_COUNT_MAX) - 1;
970
971 /*
972 * var-sized hypercall, var-size starts after vp_mask (thus
973 * vp_set.format does not count, but vp_set.valid_banks does).
974 */
975 var_size = 1 + HV_VP_SET_BANK_COUNT_MAX;
976
977 for_each_cpu_and(cpu, dest, cpu_online_mask) {
978 cpu_vmbus = hv_cpu_number_to_vp_number(cpu);
979
980 if (cpu_vmbus >= HV_VP_SET_BANK_COUNT_MAX * 64) {
981 dev_err(&hbus->hdev->device,
982 "too high CPU %d", cpu_vmbus);
983 res = 1;
984 goto exit_unlock;
985 }
986
987 params->int_target.vp_set.masks[cpu_vmbus / 64] |=
988 (1ULL << (cpu_vmbus & 63));
989 }
990 } else {
991 for_each_cpu_and(cpu, dest, cpu_online_mask) {
992 params->int_target.vp_mask |=
993 (1ULL << hv_cpu_number_to_vp_number(cpu));
994 }
995 }
996
997 res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17),
998 params, NULL);
999
1000 exit_unlock:
1001 spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags);
1002
1003 if (res) {
1004 dev_err(&hbus->hdev->device,
1005 "%s() failed: %#llx", __func__, res);
1006 return;
1007 }
1008
1009 pci_msi_unmask_irq(data);
1010 }
1011
1012 struct compose_comp_ctxt {
1013 struct hv_pci_compl comp_pkt;
1014 struct tran_int_desc int_desc;
1015 };
1016
1017 static void hv_pci_compose_compl(void *context, struct pci_response *resp,
1018 int resp_packet_size)
1019 {
1020 struct compose_comp_ctxt *comp_pkt = context;
1021 struct pci_create_int_response *int_resp =
1022 (struct pci_create_int_response *)resp;
1023
1024 comp_pkt->comp_pkt.completion_status = resp->status;
1025 comp_pkt->int_desc = int_resp->int_desc;
1026 complete(&comp_pkt->comp_pkt.host_event);
1027 }
1028
1029 static u32 hv_compose_msi_req_v1(
1030 struct pci_create_interrupt *int_pkt, struct cpumask *affinity,
1031 u32 slot, u8 vector)
1032 {
1033 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
1034 int_pkt->wslot.slot = slot;
1035 int_pkt->int_desc.vector = vector;
1036 int_pkt->int_desc.vector_count = 1;
1037 int_pkt->int_desc.delivery_mode = dest_Fixed;
1038
1039 /*
1040 * Create MSI w/ dummy vCPU set, overwritten by subsequent retarget in
1041 * hv_irq_unmask().
1042 */
1043 int_pkt->int_desc.cpu_mask = CPU_AFFINITY_ALL;
1044
1045 return sizeof(*int_pkt);
1046 }
1047
1048 static u32 hv_compose_msi_req_v2(
1049 struct pci_create_interrupt2 *int_pkt, struct cpumask *affinity,
1050 u32 slot, u8 vector)
1051 {
1052 int cpu;
1053
1054 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2;
1055 int_pkt->wslot.slot = slot;
1056 int_pkt->int_desc.vector = vector;
1057 int_pkt->int_desc.vector_count = 1;
1058 int_pkt->int_desc.delivery_mode = dest_Fixed;
1059
1060 /*
1061 * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten
1062 * by subsequent retarget in hv_irq_unmask().
1063 */
1064 cpu = cpumask_first_and(affinity, cpu_online_mask);
1065 int_pkt->int_desc.processor_array[0] =
1066 hv_cpu_number_to_vp_number(cpu);
1067 int_pkt->int_desc.processor_count = 1;
1068
1069 return sizeof(*int_pkt);
1070 }
1071
1072 /**
1073 * hv_compose_msi_msg() - Supplies a valid MSI address/data
1074 * @data: Everything about this MSI
1075 * @msg: Buffer that is filled in by this function
1076 *
1077 * This function unpacks the IRQ looking for target CPU set, IDT
1078 * vector and mode and sends a message to the parent partition
1079 * asking for a mapping for that tuple in this partition. The
1080 * response supplies a data value and address to which that data
1081 * should be written to trigger that interrupt.
1082 */
1083 static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
1084 {
1085 struct irq_cfg *cfg = irqd_cfg(data);
1086 struct hv_pcibus_device *hbus;
1087 struct hv_pci_dev *hpdev;
1088 struct pci_bus *pbus;
1089 struct pci_dev *pdev;
1090 struct cpumask *dest;
1091 struct compose_comp_ctxt comp;
1092 struct tran_int_desc *int_desc;
1093 struct {
1094 struct pci_packet pci_pkt;
1095 union {
1096 struct pci_create_interrupt v1;
1097 struct pci_create_interrupt2 v2;
1098 } int_pkts;
1099 } __packed ctxt;
1100
1101 u32 size;
1102 int ret;
1103
1104 pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data));
1105 dest = irq_data_get_effective_affinity_mask(data);
1106 pbus = pdev->bus;
1107 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
1108 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
1109 if (!hpdev)
1110 goto return_null_message;
1111
1112 /* Free any previous message that might have already been composed. */
1113 if (data->chip_data) {
1114 int_desc = data->chip_data;
1115 data->chip_data = NULL;
1116 hv_int_desc_free(hpdev, int_desc);
1117 }
1118
1119 int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC);
1120 if (!int_desc)
1121 goto drop_reference;
1122
1123 memset(&ctxt, 0, sizeof(ctxt));
1124 init_completion(&comp.comp_pkt.host_event);
1125 ctxt.pci_pkt.completion_func = hv_pci_compose_compl;
1126 ctxt.pci_pkt.compl_ctxt = &comp;
1127
1128 switch (pci_protocol_version) {
1129 case PCI_PROTOCOL_VERSION_1_1:
1130 size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1,
1131 dest,
1132 hpdev->desc.win_slot.slot,
1133 cfg->vector);
1134 break;
1135
1136 case PCI_PROTOCOL_VERSION_1_2:
1137 size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2,
1138 dest,
1139 hpdev->desc.win_slot.slot,
1140 cfg->vector);
1141 break;
1142
1143 default:
1144 /* As we only negotiate protocol versions known to this driver,
1145 * this path should never hit. However, this is it not a hot
1146 * path so we print a message to aid future updates.
1147 */
1148 dev_err(&hbus->hdev->device,
1149 "Unexpected vPCI protocol, update driver.");
1150 goto free_int_desc;
1151 }
1152
1153 ret = vmbus_sendpacket(hpdev->hbus->hdev->channel, &ctxt.int_pkts,
1154 size, (unsigned long)&ctxt.pci_pkt,
1155 VM_PKT_DATA_INBAND,
1156 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1157 if (ret) {
1158 dev_err(&hbus->hdev->device,
1159 "Sending request for interrupt failed: 0x%x",
1160 comp.comp_pkt.completion_status);
1161 goto free_int_desc;
1162 }
1163
1164 /*
1165 * Since this function is called with IRQ locks held, can't
1166 * do normal wait for completion; instead poll.
1167 */
1168 while (!try_wait_for_completion(&comp.comp_pkt.host_event)) {
1169 /* 0xFFFF means an invalid PCI VENDOR ID. */
1170 if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) {
1171 dev_err_once(&hbus->hdev->device,
1172 "the device has gone\n");
1173 goto free_int_desc;
1174 }
1175
1176 /*
1177 * When the higher level interrupt code calls us with
1178 * interrupt disabled, we must poll the channel by calling
1179 * the channel callback directly when channel->target_cpu is
1180 * the current CPU. When the higher level interrupt code
1181 * calls us with interrupt enabled, let's add the
1182 * local_bh_disable()/enable() to avoid race.
1183 */
1184 local_bh_disable();
1185
1186 if (hbus->hdev->channel->target_cpu == smp_processor_id())
1187 hv_pci_onchannelcallback(hbus);
1188
1189 local_bh_enable();
1190
1191 if (hpdev->state == hv_pcichild_ejecting) {
1192 dev_err_once(&hbus->hdev->device,
1193 "the device is being ejected\n");
1194 goto free_int_desc;
1195 }
1196
1197 udelay(100);
1198 }
1199
1200 if (comp.comp_pkt.completion_status < 0) {
1201 dev_err(&hbus->hdev->device,
1202 "Request for interrupt failed: 0x%x",
1203 comp.comp_pkt.completion_status);
1204 goto free_int_desc;
1205 }
1206
1207 /*
1208 * Record the assignment so that this can be unwound later. Using
1209 * irq_set_chip_data() here would be appropriate, but the lock it takes
1210 * is already held.
1211 */
1212 *int_desc = comp.int_desc;
1213 data->chip_data = int_desc;
1214
1215 /* Pass up the result. */
1216 msg->address_hi = comp.int_desc.address >> 32;
1217 msg->address_lo = comp.int_desc.address & 0xffffffff;
1218 msg->data = comp.int_desc.data;
1219
1220 put_pcichild(hpdev, hv_pcidev_ref_by_slot);
1221 return;
1222
1223 free_int_desc:
1224 kfree(int_desc);
1225 drop_reference:
1226 put_pcichild(hpdev, hv_pcidev_ref_by_slot);
1227 return_null_message:
1228 msg->address_hi = 0;
1229 msg->address_lo = 0;
1230 msg->data = 0;
1231 }
1232
1233 /* HW Interrupt Chip Descriptor */
1234 static struct irq_chip hv_msi_irq_chip = {
1235 .name = "Hyper-V PCIe MSI",
1236 .irq_compose_msi_msg = hv_compose_msi_msg,
1237 .irq_set_affinity = hv_set_affinity,
1238 .irq_ack = irq_chip_ack_parent,
1239 .irq_mask = hv_irq_mask,
1240 .irq_unmask = hv_irq_unmask,
1241 };
1242
1243 static irq_hw_number_t hv_msi_domain_ops_get_hwirq(struct msi_domain_info *info,
1244 msi_alloc_info_t *arg)
1245 {
1246 return arg->msi_hwirq;
1247 }
1248
1249 static struct msi_domain_ops hv_msi_ops = {
1250 .get_hwirq = hv_msi_domain_ops_get_hwirq,
1251 .msi_prepare = pci_msi_prepare,
1252 .set_desc = pci_msi_set_desc,
1253 .msi_free = hv_msi_free,
1254 };
1255
1256 /**
1257 * hv_pcie_init_irq_domain() - Initialize IRQ domain
1258 * @hbus: The root PCI bus
1259 *
1260 * This function creates an IRQ domain which will be used for
1261 * interrupts from devices that have been passed through. These
1262 * devices only support MSI and MSI-X, not line-based interrupts
1263 * or simulations of line-based interrupts through PCIe's
1264 * fabric-layer messages. Because interrupts are remapped, we
1265 * can support multi-message MSI here.
1266 *
1267 * Return: '0' on success and error value on failure
1268 */
1269 static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus)
1270 {
1271 hbus->msi_info.chip = &hv_msi_irq_chip;
1272 hbus->msi_info.ops = &hv_msi_ops;
1273 hbus->msi_info.flags = (MSI_FLAG_USE_DEF_DOM_OPS |
1274 MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI |
1275 MSI_FLAG_PCI_MSIX);
1276 hbus->msi_info.handler = handle_edge_irq;
1277 hbus->msi_info.handler_name = "edge";
1278 hbus->msi_info.data = hbus;
1279 hbus->irq_domain = pci_msi_create_irq_domain(hbus->sysdata.fwnode,
1280 &hbus->msi_info,
1281 x86_vector_domain);
1282 if (!hbus->irq_domain) {
1283 dev_err(&hbus->hdev->device,
1284 "Failed to build an MSI IRQ domain\n");
1285 return -ENODEV;
1286 }
1287
1288 return 0;
1289 }
1290
1291 /**
1292 * get_bar_size() - Get the address space consumed by a BAR
1293 * @bar_val: Value that a BAR returned after -1 was written
1294 * to it.
1295 *
1296 * This function returns the size of the BAR, rounded up to 1
1297 * page. It has to be rounded up because the hypervisor's page
1298 * table entry that maps the BAR into the VM can't specify an
1299 * offset within a page. The invariant is that the hypervisor
1300 * must place any BARs of smaller than page length at the
1301 * beginning of a page.
1302 *
1303 * Return: Size in bytes of the consumed MMIO space.
1304 */
1305 static u64 get_bar_size(u64 bar_val)
1306 {
1307 return round_up((1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK)),
1308 PAGE_SIZE);
1309 }
1310
1311 /**
1312 * survey_child_resources() - Total all MMIO requirements
1313 * @hbus: Root PCI bus, as understood by this driver
1314 */
1315 static void survey_child_resources(struct hv_pcibus_device *hbus)
1316 {
1317 struct list_head *iter;
1318 struct hv_pci_dev *hpdev;
1319 resource_size_t bar_size = 0;
1320 unsigned long flags;
1321 struct completion *event;
1322 u64 bar_val;
1323 int i;
1324
1325 /* If nobody is waiting on the answer, don't compute it. */
1326 event = xchg(&hbus->survey_event, NULL);
1327 if (!event)
1328 return;
1329
1330 /* If the answer has already been computed, go with it. */
1331 if (hbus->low_mmio_space || hbus->high_mmio_space) {
1332 complete(event);
1333 return;
1334 }
1335
1336 spin_lock_irqsave(&hbus->device_list_lock, flags);
1337
1338 /*
1339 * Due to an interesting quirk of the PCI spec, all memory regions
1340 * for a child device are a power of 2 in size and aligned in memory,
1341 * so it's sufficient to just add them up without tracking alignment.
1342 */
1343 list_for_each(iter, &hbus->children) {
1344 hpdev = container_of(iter, struct hv_pci_dev, list_entry);
1345 for (i = 0; i < 6; i++) {
1346 if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO)
1347 dev_err(&hbus->hdev->device,
1348 "There's an I/O BAR in this list!\n");
1349
1350 if (hpdev->probed_bar[i] != 0) {
1351 /*
1352 * A probed BAR has all the upper bits set that
1353 * can be changed.
1354 */
1355
1356 bar_val = hpdev->probed_bar[i];
1357 if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
1358 bar_val |=
1359 ((u64)hpdev->probed_bar[++i] << 32);
1360 else
1361 bar_val |= 0xffffffff00000000ULL;
1362
1363 bar_size = get_bar_size(bar_val);
1364
1365 if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
1366 hbus->high_mmio_space += bar_size;
1367 else
1368 hbus->low_mmio_space += bar_size;
1369 }
1370 }
1371 }
1372
1373 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1374 complete(event);
1375 }
1376
1377 /**
1378 * prepopulate_bars() - Fill in BARs with defaults
1379 * @hbus: Root PCI bus, as understood by this driver
1380 *
1381 * The core PCI driver code seems much, much happier if the BARs
1382 * for a device have values upon first scan. So fill them in.
1383 * The algorithm below works down from large sizes to small,
1384 * attempting to pack the assignments optimally. The assumption,
1385 * enforced in other parts of the code, is that the beginning of
1386 * the memory-mapped I/O space will be aligned on the largest
1387 * BAR size.
1388 */
1389 static void prepopulate_bars(struct hv_pcibus_device *hbus)
1390 {
1391 resource_size_t high_size = 0;
1392 resource_size_t low_size = 0;
1393 resource_size_t high_base = 0;
1394 resource_size_t low_base = 0;
1395 resource_size_t bar_size;
1396 struct hv_pci_dev *hpdev;
1397 struct list_head *iter;
1398 unsigned long flags;
1399 u64 bar_val;
1400 u32 command;
1401 bool high;
1402 int i;
1403
1404 if (hbus->low_mmio_space) {
1405 low_size = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
1406 low_base = hbus->low_mmio_res->start;
1407 }
1408
1409 if (hbus->high_mmio_space) {
1410 high_size = 1ULL <<
1411 (63 - __builtin_clzll(hbus->high_mmio_space));
1412 high_base = hbus->high_mmio_res->start;
1413 }
1414
1415 spin_lock_irqsave(&hbus->device_list_lock, flags);
1416
1417 /* Pick addresses for the BARs. */
1418 do {
1419 list_for_each(iter, &hbus->children) {
1420 hpdev = container_of(iter, struct hv_pci_dev,
1421 list_entry);
1422 for (i = 0; i < 6; i++) {
1423 bar_val = hpdev->probed_bar[i];
1424 if (bar_val == 0)
1425 continue;
1426 high = bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64;
1427 if (high) {
1428 bar_val |=
1429 ((u64)hpdev->probed_bar[i + 1]
1430 << 32);
1431 } else {
1432 bar_val |= 0xffffffffULL << 32;
1433 }
1434 bar_size = get_bar_size(bar_val);
1435 if (high) {
1436 if (high_size != bar_size) {
1437 i++;
1438 continue;
1439 }
1440 _hv_pcifront_write_config(hpdev,
1441 PCI_BASE_ADDRESS_0 + (4 * i),
1442 4,
1443 (u32)(high_base & 0xffffff00));
1444 i++;
1445 _hv_pcifront_write_config(hpdev,
1446 PCI_BASE_ADDRESS_0 + (4 * i),
1447 4, (u32)(high_base >> 32));
1448 high_base += bar_size;
1449 } else {
1450 if (low_size != bar_size)
1451 continue;
1452 _hv_pcifront_write_config(hpdev,
1453 PCI_BASE_ADDRESS_0 + (4 * i),
1454 4,
1455 (u32)(low_base & 0xffffff00));
1456 low_base += bar_size;
1457 }
1458 }
1459 if (high_size <= 1 && low_size <= 1) {
1460 /* Set the memory enable bit. */
1461 _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2,
1462 &command);
1463 command |= PCI_COMMAND_MEMORY;
1464 _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2,
1465 command);
1466 break;
1467 }
1468 }
1469
1470 high_size >>= 1;
1471 low_size >>= 1;
1472 } while (high_size || low_size);
1473
1474 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1475 }
1476
1477 /**
1478 * create_root_hv_pci_bus() - Expose a new root PCI bus
1479 * @hbus: Root PCI bus, as understood by this driver
1480 *
1481 * Return: 0 on success, -errno on failure
1482 */
1483 static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus)
1484 {
1485 /* Register the device */
1486 hbus->pci_bus = pci_create_root_bus(&hbus->hdev->device,
1487 0, /* bus number is always zero */
1488 &hv_pcifront_ops,
1489 &hbus->sysdata,
1490 &hbus->resources_for_children);
1491 if (!hbus->pci_bus)
1492 return -ENODEV;
1493
1494 hbus->pci_bus->msi = &hbus->msi_chip;
1495 hbus->pci_bus->msi->dev = &hbus->hdev->device;
1496
1497 pci_lock_rescan_remove();
1498 pci_scan_child_bus(hbus->pci_bus);
1499 pci_bus_assign_resources(hbus->pci_bus);
1500 pci_bus_add_devices(hbus->pci_bus);
1501 pci_unlock_rescan_remove();
1502 hbus->state = hv_pcibus_installed;
1503 return 0;
1504 }
1505
1506 struct q_res_req_compl {
1507 struct completion host_event;
1508 struct hv_pci_dev *hpdev;
1509 };
1510
1511 /**
1512 * q_resource_requirements() - Query Resource Requirements
1513 * @context: The completion context.
1514 * @resp: The response that came from the host.
1515 * @resp_packet_size: The size in bytes of resp.
1516 *
1517 * This function is invoked on completion of a Query Resource
1518 * Requirements packet.
1519 */
1520 static void q_resource_requirements(void *context, struct pci_response *resp,
1521 int resp_packet_size)
1522 {
1523 struct q_res_req_compl *completion = context;
1524 struct pci_q_res_req_response *q_res_req =
1525 (struct pci_q_res_req_response *)resp;
1526 int i;
1527
1528 if (resp->status < 0) {
1529 dev_err(&completion->hpdev->hbus->hdev->device,
1530 "query resource requirements failed: %x\n",
1531 resp->status);
1532 } else {
1533 for (i = 0; i < 6; i++) {
1534 completion->hpdev->probed_bar[i] =
1535 q_res_req->probed_bar[i];
1536 }
1537 }
1538
1539 complete(&completion->host_event);
1540 }
1541
1542 static void get_pcichild(struct hv_pci_dev *hpdev,
1543 enum hv_pcidev_ref_reason reason)
1544 {
1545 refcount_inc(&hpdev->refs);
1546 }
1547
1548 static void put_pcichild(struct hv_pci_dev *hpdev,
1549 enum hv_pcidev_ref_reason reason)
1550 {
1551 if (refcount_dec_and_test(&hpdev->refs))
1552 kfree(hpdev);
1553 }
1554
1555 /**
1556 * new_pcichild_device() - Create a new child device
1557 * @hbus: The internal struct tracking this root PCI bus.
1558 * @desc: The information supplied so far from the host
1559 * about the device.
1560 *
1561 * This function creates the tracking structure for a new child
1562 * device and kicks off the process of figuring out what it is.
1563 *
1564 * Return: Pointer to the new tracking struct
1565 */
1566 static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus,
1567 struct pci_function_description *desc)
1568 {
1569 struct hv_pci_dev *hpdev;
1570 struct pci_child_message *res_req;
1571 struct q_res_req_compl comp_pkt;
1572 struct {
1573 struct pci_packet init_packet;
1574 u8 buffer[sizeof(struct pci_child_message)];
1575 } pkt;
1576 unsigned long flags;
1577 int ret;
1578
1579 hpdev = kzalloc(sizeof(*hpdev), GFP_ATOMIC);
1580 if (!hpdev)
1581 return NULL;
1582
1583 hpdev->hbus = hbus;
1584
1585 memset(&pkt, 0, sizeof(pkt));
1586 init_completion(&comp_pkt.host_event);
1587 comp_pkt.hpdev = hpdev;
1588 pkt.init_packet.compl_ctxt = &comp_pkt;
1589 pkt.init_packet.completion_func = q_resource_requirements;
1590 res_req = (struct pci_child_message *)&pkt.init_packet.message;
1591 res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS;
1592 res_req->wslot.slot = desc->win_slot.slot;
1593
1594 ret = vmbus_sendpacket(hbus->hdev->channel, res_req,
1595 sizeof(struct pci_child_message),
1596 (unsigned long)&pkt.init_packet,
1597 VM_PKT_DATA_INBAND,
1598 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1599 if (ret)
1600 goto error;
1601
1602 if (wait_for_response(hbus->hdev, &comp_pkt.host_event))
1603 goto error;
1604
1605 hpdev->desc = *desc;
1606 refcount_set(&hpdev->refs, 1);
1607 get_pcichild(hpdev, hv_pcidev_ref_childlist);
1608 spin_lock_irqsave(&hbus->device_list_lock, flags);
1609
1610 /*
1611 * When a device is being added to the bus, we set the PCI domain
1612 * number to be the device serial number, which is non-zero and
1613 * unique on the same VM. The serial numbers start with 1, and
1614 * increase by 1 for each device. So device names including this
1615 * can have shorter names than based on the bus instance UUID.
1616 * Only the first device serial number is used for domain, so the
1617 * domain number will not change after the first device is added.
1618 * The lower 16 bits of the serial number is used, otherwise some
1619 * drivers may not be able to handle it.
1620 */
1621 if (list_empty(&hbus->children))
1622 hbus->sysdata.domain = desc->ser & 0xFFFF;
1623 list_add_tail(&hpdev->list_entry, &hbus->children);
1624 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1625 return hpdev;
1626
1627 error:
1628 kfree(hpdev);
1629 return NULL;
1630 }
1631
1632 /**
1633 * get_pcichild_wslot() - Find device from slot
1634 * @hbus: Root PCI bus, as understood by this driver
1635 * @wslot: Location on the bus
1636 *
1637 * This function looks up a PCI device and returns the internal
1638 * representation of it. It acquires a reference on it, so that
1639 * the device won't be deleted while somebody is using it. The
1640 * caller is responsible for calling put_pcichild() to release
1641 * this reference.
1642 *
1643 * Return: Internal representation of a PCI device
1644 */
1645 static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
1646 u32 wslot)
1647 {
1648 unsigned long flags;
1649 struct hv_pci_dev *iter, *hpdev = NULL;
1650
1651 spin_lock_irqsave(&hbus->device_list_lock, flags);
1652 list_for_each_entry(iter, &hbus->children, list_entry) {
1653 if (iter->desc.win_slot.slot == wslot) {
1654 hpdev = iter;
1655 get_pcichild(hpdev, hv_pcidev_ref_by_slot);
1656 break;
1657 }
1658 }
1659 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1660
1661 return hpdev;
1662 }
1663
1664 /**
1665 * pci_devices_present_work() - Handle new list of child devices
1666 * @work: Work struct embedded in struct hv_dr_work
1667 *
1668 * "Bus Relations" is the Windows term for "children of this
1669 * bus." The terminology is preserved here for people trying to
1670 * debug the interaction between Hyper-V and Linux. This
1671 * function is called when the parent partition reports a list
1672 * of functions that should be observed under this PCI Express
1673 * port (bus).
1674 *
1675 * This function updates the list, and must tolerate being
1676 * called multiple times with the same information. The typical
1677 * number of child devices is one, with very atypical cases
1678 * involving three or four, so the algorithms used here can be
1679 * simple and inefficient.
1680 *
1681 * It must also treat the omission of a previously observed device as
1682 * notification that the device no longer exists.
1683 *
1684 * Note that this function is serialized with hv_eject_device_work(),
1685 * because both are pushed to the ordered workqueue hbus->wq.
1686 */
1687 static void pci_devices_present_work(struct work_struct *work)
1688 {
1689 u32 child_no;
1690 bool found;
1691 struct list_head *iter;
1692 struct pci_function_description *new_desc;
1693 struct hv_pci_dev *hpdev;
1694 struct hv_pcibus_device *hbus;
1695 struct list_head removed;
1696 struct hv_dr_work *dr_wrk;
1697 struct hv_dr_state *dr = NULL;
1698 unsigned long flags;
1699
1700 dr_wrk = container_of(work, struct hv_dr_work, wrk);
1701 hbus = dr_wrk->bus;
1702 kfree(dr_wrk);
1703
1704 INIT_LIST_HEAD(&removed);
1705
1706 /* Pull this off the queue and process it if it was the last one. */
1707 spin_lock_irqsave(&hbus->device_list_lock, flags);
1708 while (!list_empty(&hbus->dr_list)) {
1709 dr = list_first_entry(&hbus->dr_list, struct hv_dr_state,
1710 list_entry);
1711 list_del(&dr->list_entry);
1712
1713 /* Throw this away if the list still has stuff in it. */
1714 if (!list_empty(&hbus->dr_list)) {
1715 kfree(dr);
1716 continue;
1717 }
1718 }
1719 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1720
1721 if (!dr) {
1722 put_hvpcibus(hbus);
1723 return;
1724 }
1725
1726 /* First, mark all existing children as reported missing. */
1727 spin_lock_irqsave(&hbus->device_list_lock, flags);
1728 list_for_each(iter, &hbus->children) {
1729 hpdev = container_of(iter, struct hv_pci_dev,
1730 list_entry);
1731 hpdev->reported_missing = true;
1732 }
1733 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1734
1735 /* Next, add back any reported devices. */
1736 for (child_no = 0; child_no < dr->device_count; child_no++) {
1737 found = false;
1738 new_desc = &dr->func[child_no];
1739
1740 spin_lock_irqsave(&hbus->device_list_lock, flags);
1741 list_for_each(iter, &hbus->children) {
1742 hpdev = container_of(iter, struct hv_pci_dev,
1743 list_entry);
1744 if ((hpdev->desc.win_slot.slot ==
1745 new_desc->win_slot.slot) &&
1746 (hpdev->desc.v_id == new_desc->v_id) &&
1747 (hpdev->desc.d_id == new_desc->d_id) &&
1748 (hpdev->desc.ser == new_desc->ser)) {
1749 hpdev->reported_missing = false;
1750 found = true;
1751 }
1752 }
1753 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1754
1755 if (!found) {
1756 hpdev = new_pcichild_device(hbus, new_desc);
1757 if (!hpdev)
1758 dev_err(&hbus->hdev->device,
1759 "couldn't record a child device.\n");
1760 }
1761 }
1762
1763 /* Move missing children to a list on the stack. */
1764 spin_lock_irqsave(&hbus->device_list_lock, flags);
1765 do {
1766 found = false;
1767 list_for_each(iter, &hbus->children) {
1768 hpdev = container_of(iter, struct hv_pci_dev,
1769 list_entry);
1770 if (hpdev->reported_missing) {
1771 found = true;
1772 put_pcichild(hpdev, hv_pcidev_ref_childlist);
1773 list_move_tail(&hpdev->list_entry, &removed);
1774 break;
1775 }
1776 }
1777 } while (found);
1778 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1779
1780 /* Delete everything that should no longer exist. */
1781 while (!list_empty(&removed)) {
1782 hpdev = list_first_entry(&removed, struct hv_pci_dev,
1783 list_entry);
1784 list_del(&hpdev->list_entry);
1785 put_pcichild(hpdev, hv_pcidev_ref_initial);
1786 }
1787
1788 switch (hbus->state) {
1789 case hv_pcibus_installed:
1790 /*
1791 * Tell the core to rescan bus
1792 * because there may have been changes.
1793 */
1794 pci_lock_rescan_remove();
1795 pci_scan_child_bus(hbus->pci_bus);
1796 pci_unlock_rescan_remove();
1797 break;
1798
1799 case hv_pcibus_init:
1800 case hv_pcibus_probed:
1801 survey_child_resources(hbus);
1802 break;
1803
1804 default:
1805 break;
1806 }
1807
1808 put_hvpcibus(hbus);
1809 kfree(dr);
1810 }
1811
1812 /**
1813 * hv_pci_devices_present() - Handles list of new children
1814 * @hbus: Root PCI bus, as understood by this driver
1815 * @relations: Packet from host listing children
1816 *
1817 * This function is invoked whenever a new list of devices for
1818 * this bus appears.
1819 */
1820 static void hv_pci_devices_present(struct hv_pcibus_device *hbus,
1821 struct pci_bus_relations *relations)
1822 {
1823 struct hv_dr_state *dr;
1824 struct hv_dr_work *dr_wrk;
1825 unsigned long flags;
1826 bool pending_dr;
1827
1828 dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT);
1829 if (!dr_wrk)
1830 return;
1831
1832 dr = kzalloc(offsetof(struct hv_dr_state, func) +
1833 (sizeof(struct pci_function_description) *
1834 (relations->device_count)), GFP_NOWAIT);
1835 if (!dr) {
1836 kfree(dr_wrk);
1837 return;
1838 }
1839
1840 INIT_WORK(&dr_wrk->wrk, pci_devices_present_work);
1841 dr_wrk->bus = hbus;
1842 dr->device_count = relations->device_count;
1843 if (dr->device_count != 0) {
1844 memcpy(dr->func, relations->func,
1845 sizeof(struct pci_function_description) *
1846 dr->device_count);
1847 }
1848
1849 spin_lock_irqsave(&hbus->device_list_lock, flags);
1850 /*
1851 * If pending_dr is true, we have already queued a work,
1852 * which will see the new dr. Otherwise, we need to
1853 * queue a new work.
1854 */
1855 pending_dr = !list_empty(&hbus->dr_list);
1856 list_add_tail(&dr->list_entry, &hbus->dr_list);
1857 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1858
1859 if (pending_dr) {
1860 kfree(dr_wrk);
1861 } else {
1862 get_hvpcibus(hbus);
1863 queue_work(hbus->wq, &dr_wrk->wrk);
1864 }
1865 }
1866
1867 /**
1868 * hv_eject_device_work() - Asynchronously handles ejection
1869 * @work: Work struct embedded in internal device struct
1870 *
1871 * This function handles ejecting a device. Windows will
1872 * attempt to gracefully eject a device, waiting 60 seconds to
1873 * hear back from the guest OS that this completed successfully.
1874 * If this timer expires, the device will be forcibly removed.
1875 */
1876 static void hv_eject_device_work(struct work_struct *work)
1877 {
1878 struct pci_eject_response *ejct_pkt;
1879 struct hv_pci_dev *hpdev;
1880 struct pci_dev *pdev;
1881 unsigned long flags;
1882 int wslot;
1883 struct {
1884 struct pci_packet pkt;
1885 u8 buffer[sizeof(struct pci_eject_response)];
1886 } ctxt;
1887
1888 hpdev = container_of(work, struct hv_pci_dev, wrk);
1889
1890 WARN_ON(hpdev->state != hv_pcichild_ejecting);
1891
1892 /*
1893 * Ejection can come before or after the PCI bus has been set up, so
1894 * attempt to find it and tear down the bus state, if it exists. This
1895 * must be done without constructs like pci_domain_nr(hbus->pci_bus)
1896 * because hbus->pci_bus may not exist yet.
1897 */
1898 wslot = wslot_to_devfn(hpdev->desc.win_slot.slot);
1899 pdev = pci_get_domain_bus_and_slot(hpdev->hbus->sysdata.domain, 0,
1900 wslot);
1901 if (pdev) {
1902 pci_lock_rescan_remove();
1903 pci_stop_and_remove_bus_device(pdev);
1904 pci_dev_put(pdev);
1905 pci_unlock_rescan_remove();
1906 }
1907
1908 spin_lock_irqsave(&hpdev->hbus->device_list_lock, flags);
1909 list_del(&hpdev->list_entry);
1910 spin_unlock_irqrestore(&hpdev->hbus->device_list_lock, flags);
1911
1912 memset(&ctxt, 0, sizeof(ctxt));
1913 ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message;
1914 ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE;
1915 ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot;
1916 vmbus_sendpacket(hpdev->hbus->hdev->channel, ejct_pkt,
1917 sizeof(*ejct_pkt), (unsigned long)&ctxt.pkt,
1918 VM_PKT_DATA_INBAND, 0);
1919
1920 put_pcichild(hpdev, hv_pcidev_ref_childlist);
1921 put_pcichild(hpdev, hv_pcidev_ref_pnp);
1922 put_hvpcibus(hpdev->hbus);
1923 }
1924
1925 /**
1926 * hv_pci_eject_device() - Handles device ejection
1927 * @hpdev: Internal device tracking struct
1928 *
1929 * This function is invoked when an ejection packet arrives. It
1930 * just schedules work so that we don't re-enter the packet
1931 * delivery code handling the ejection.
1932 */
1933 static void hv_pci_eject_device(struct hv_pci_dev *hpdev)
1934 {
1935 hpdev->state = hv_pcichild_ejecting;
1936 get_pcichild(hpdev, hv_pcidev_ref_pnp);
1937 INIT_WORK(&hpdev->wrk, hv_eject_device_work);
1938 get_hvpcibus(hpdev->hbus);
1939 queue_work(hpdev->hbus->wq, &hpdev->wrk);
1940 }
1941
1942 /**
1943 * hv_pci_onchannelcallback() - Handles incoming packets
1944 * @context: Internal bus tracking struct
1945 *
1946 * This function is invoked whenever the host sends a packet to
1947 * this channel (which is private to this root PCI bus).
1948 */
1949 static void hv_pci_onchannelcallback(void *context)
1950 {
1951 const int packet_size = 0x100;
1952 int ret;
1953 struct hv_pcibus_device *hbus = context;
1954 u32 bytes_recvd;
1955 u64 req_id;
1956 struct vmpacket_descriptor *desc;
1957 unsigned char *buffer;
1958 int bufferlen = packet_size;
1959 struct pci_packet *comp_packet;
1960 struct pci_response *response;
1961 struct pci_incoming_message *new_message;
1962 struct pci_bus_relations *bus_rel;
1963 struct pci_dev_incoming *dev_message;
1964 struct hv_pci_dev *hpdev;
1965
1966 buffer = kmalloc(bufferlen, GFP_ATOMIC);
1967 if (!buffer)
1968 return;
1969
1970 while (1) {
1971 ret = vmbus_recvpacket_raw(hbus->hdev->channel, buffer,
1972 bufferlen, &bytes_recvd, &req_id);
1973
1974 if (ret == -ENOBUFS) {
1975 kfree(buffer);
1976 /* Handle large packet */
1977 bufferlen = bytes_recvd;
1978 buffer = kmalloc(bytes_recvd, GFP_ATOMIC);
1979 if (!buffer)
1980 return;
1981 continue;
1982 }
1983
1984 /* Zero length indicates there are no more packets. */
1985 if (ret || !bytes_recvd)
1986 break;
1987
1988 /*
1989 * All incoming packets must be at least as large as a
1990 * response.
1991 */
1992 if (bytes_recvd <= sizeof(struct pci_response))
1993 continue;
1994 desc = (struct vmpacket_descriptor *)buffer;
1995
1996 switch (desc->type) {
1997 case VM_PKT_COMP:
1998
1999 /*
2000 * The host is trusted, and thus it's safe to interpret
2001 * this transaction ID as a pointer.
2002 */
2003 comp_packet = (struct pci_packet *)req_id;
2004 response = (struct pci_response *)buffer;
2005 comp_packet->completion_func(comp_packet->compl_ctxt,
2006 response,
2007 bytes_recvd);
2008 break;
2009
2010 case VM_PKT_DATA_INBAND:
2011
2012 new_message = (struct pci_incoming_message *)buffer;
2013 switch (new_message->message_type.type) {
2014 case PCI_BUS_RELATIONS:
2015
2016 bus_rel = (struct pci_bus_relations *)buffer;
2017 if (bytes_recvd <
2018 offsetof(struct pci_bus_relations, func) +
2019 (sizeof(struct pci_function_description) *
2020 (bus_rel->device_count))) {
2021 dev_err(&hbus->hdev->device,
2022 "bus relations too small\n");
2023 break;
2024 }
2025
2026 hv_pci_devices_present(hbus, bus_rel);
2027 break;
2028
2029 case PCI_EJECT:
2030
2031 dev_message = (struct pci_dev_incoming *)buffer;
2032 hpdev = get_pcichild_wslot(hbus,
2033 dev_message->wslot.slot);
2034 if (hpdev) {
2035 hv_pci_eject_device(hpdev);
2036 put_pcichild(hpdev,
2037 hv_pcidev_ref_by_slot);
2038 }
2039 break;
2040
2041 default:
2042 dev_warn(&hbus->hdev->device,
2043 "Unimplemented protocol message %x\n",
2044 new_message->message_type.type);
2045 break;
2046 }
2047 break;
2048
2049 default:
2050 dev_err(&hbus->hdev->device,
2051 "unhandled packet type %d, tid %llx len %d\n",
2052 desc->type, req_id, bytes_recvd);
2053 break;
2054 }
2055 }
2056
2057 kfree(buffer);
2058 }
2059
2060 /**
2061 * hv_pci_protocol_negotiation() - Set up protocol
2062 * @hdev: VMBus's tracking struct for this root PCI bus
2063 *
2064 * This driver is intended to support running on Windows 10
2065 * (server) and later versions. It will not run on earlier
2066 * versions, as they assume that many of the operations which
2067 * Linux needs accomplished with a spinlock held were done via
2068 * asynchronous messaging via VMBus. Windows 10 increases the
2069 * surface area of PCI emulation so that these actions can take
2070 * place by suspending a virtual processor for their duration.
2071 *
2072 * This function negotiates the channel protocol version,
2073 * failing if the host doesn't support the necessary protocol
2074 * level.
2075 */
2076 static int hv_pci_protocol_negotiation(struct hv_device *hdev)
2077 {
2078 struct pci_version_request *version_req;
2079 struct hv_pci_compl comp_pkt;
2080 struct pci_packet *pkt;
2081 int ret;
2082 int i;
2083
2084 /*
2085 * Initiate the handshake with the host and negotiate
2086 * a version that the host can support. We start with the
2087 * highest version number and go down if the host cannot
2088 * support it.
2089 */
2090 pkt = kzalloc(sizeof(*pkt) + sizeof(*version_req), GFP_KERNEL);
2091 if (!pkt)
2092 return -ENOMEM;
2093
2094 init_completion(&comp_pkt.host_event);
2095 pkt->completion_func = hv_pci_generic_compl;
2096 pkt->compl_ctxt = &comp_pkt;
2097 version_req = (struct pci_version_request *)&pkt->message;
2098 version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION;
2099
2100 for (i = 0; i < ARRAY_SIZE(pci_protocol_versions); i++) {
2101 version_req->protocol_version = pci_protocol_versions[i];
2102 ret = vmbus_sendpacket(hdev->channel, version_req,
2103 sizeof(struct pci_version_request),
2104 (unsigned long)pkt, VM_PKT_DATA_INBAND,
2105 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2106 if (!ret)
2107 ret = wait_for_response(hdev, &comp_pkt.host_event);
2108
2109 if (ret) {
2110 dev_err(&hdev->device,
2111 "PCI Pass-through VSP failed to request version: %d",
2112 ret);
2113 goto exit;
2114 }
2115
2116 if (comp_pkt.completion_status >= 0) {
2117 pci_protocol_version = pci_protocol_versions[i];
2118 dev_info(&hdev->device,
2119 "PCI VMBus probing: Using version %#x\n",
2120 pci_protocol_version);
2121 goto exit;
2122 }
2123
2124 if (comp_pkt.completion_status != STATUS_REVISION_MISMATCH) {
2125 dev_err(&hdev->device,
2126 "PCI Pass-through VSP failed version request: %#x",
2127 comp_pkt.completion_status);
2128 ret = -EPROTO;
2129 goto exit;
2130 }
2131
2132 reinit_completion(&comp_pkt.host_event);
2133 }
2134
2135 dev_err(&hdev->device,
2136 "PCI pass-through VSP failed to find supported version");
2137 ret = -EPROTO;
2138
2139 exit:
2140 kfree(pkt);
2141 return ret;
2142 }
2143
2144 /**
2145 * hv_pci_free_bridge_windows() - Release memory regions for the
2146 * bus
2147 * @hbus: Root PCI bus, as understood by this driver
2148 */
2149 static void hv_pci_free_bridge_windows(struct hv_pcibus_device *hbus)
2150 {
2151 /*
2152 * Set the resources back to the way they looked when they
2153 * were allocated by setting IORESOURCE_BUSY again.
2154 */
2155
2156 if (hbus->low_mmio_space && hbus->low_mmio_res) {
2157 hbus->low_mmio_res->flags |= IORESOURCE_BUSY;
2158 vmbus_free_mmio(hbus->low_mmio_res->start,
2159 resource_size(hbus->low_mmio_res));
2160 }
2161
2162 if (hbus->high_mmio_space && hbus->high_mmio_res) {
2163 hbus->high_mmio_res->flags |= IORESOURCE_BUSY;
2164 vmbus_free_mmio(hbus->high_mmio_res->start,
2165 resource_size(hbus->high_mmio_res));
2166 }
2167 }
2168
2169 /**
2170 * hv_pci_allocate_bridge_windows() - Allocate memory regions
2171 * for the bus
2172 * @hbus: Root PCI bus, as understood by this driver
2173 *
2174 * This function calls vmbus_allocate_mmio(), which is itself a
2175 * bit of a compromise. Ideally, we might change the pnp layer
2176 * in the kernel such that it comprehends either PCI devices
2177 * which are "grandchildren of ACPI," with some intermediate bus
2178 * node (in this case, VMBus) or change it such that it
2179 * understands VMBus. The pnp layer, however, has been declared
2180 * deprecated, and not subject to change.
2181 *
2182 * The workaround, implemented here, is to ask VMBus to allocate
2183 * MMIO space for this bus. VMBus itself knows which ranges are
2184 * appropriate by looking at its own ACPI objects. Then, after
2185 * these ranges are claimed, they're modified to look like they
2186 * would have looked if the ACPI and pnp code had allocated
2187 * bridge windows. These descriptors have to exist in this form
2188 * in order to satisfy the code which will get invoked when the
2189 * endpoint PCI function driver calls request_mem_region() or
2190 * request_mem_region_exclusive().
2191 *
2192 * Return: 0 on success, -errno on failure
2193 */
2194 static int hv_pci_allocate_bridge_windows(struct hv_pcibus_device *hbus)
2195 {
2196 resource_size_t align;
2197 int ret;
2198
2199 if (hbus->low_mmio_space) {
2200 align = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
2201 ret = vmbus_allocate_mmio(&hbus->low_mmio_res, hbus->hdev, 0,
2202 (u64)(u32)0xffffffff,
2203 hbus->low_mmio_space,
2204 align, false);
2205 if (ret) {
2206 dev_err(&hbus->hdev->device,
2207 "Need %#llx of low MMIO space. Consider reconfiguring the VM.\n",
2208 hbus->low_mmio_space);
2209 return ret;
2210 }
2211
2212 /* Modify this resource to become a bridge window. */
2213 hbus->low_mmio_res->flags |= IORESOURCE_WINDOW;
2214 hbus->low_mmio_res->flags &= ~IORESOURCE_BUSY;
2215 pci_add_resource(&hbus->resources_for_children,
2216 hbus->low_mmio_res);
2217 }
2218
2219 if (hbus->high_mmio_space) {
2220 align = 1ULL << (63 - __builtin_clzll(hbus->high_mmio_space));
2221 ret = vmbus_allocate_mmio(&hbus->high_mmio_res, hbus->hdev,
2222 0x100000000, -1,
2223 hbus->high_mmio_space, align,
2224 false);
2225 if (ret) {
2226 dev_err(&hbus->hdev->device,
2227 "Need %#llx of high MMIO space. Consider reconfiguring the VM.\n",
2228 hbus->high_mmio_space);
2229 goto release_low_mmio;
2230 }
2231
2232 /* Modify this resource to become a bridge window. */
2233 hbus->high_mmio_res->flags |= IORESOURCE_WINDOW;
2234 hbus->high_mmio_res->flags &= ~IORESOURCE_BUSY;
2235 pci_add_resource(&hbus->resources_for_children,
2236 hbus->high_mmio_res);
2237 }
2238
2239 return 0;
2240
2241 release_low_mmio:
2242 if (hbus->low_mmio_res) {
2243 vmbus_free_mmio(hbus->low_mmio_res->start,
2244 resource_size(hbus->low_mmio_res));
2245 }
2246
2247 return ret;
2248 }
2249
2250 /**
2251 * hv_allocate_config_window() - Find MMIO space for PCI Config
2252 * @hbus: Root PCI bus, as understood by this driver
2253 *
2254 * This function claims memory-mapped I/O space for accessing
2255 * configuration space for the functions on this bus.
2256 *
2257 * Return: 0 on success, -errno on failure
2258 */
2259 static int hv_allocate_config_window(struct hv_pcibus_device *hbus)
2260 {
2261 int ret;
2262
2263 /*
2264 * Set up a region of MMIO space to use for accessing configuration
2265 * space.
2266 */
2267 ret = vmbus_allocate_mmio(&hbus->mem_config, hbus->hdev, 0, -1,
2268 PCI_CONFIG_MMIO_LENGTH, 0x1000, false);
2269 if (ret)
2270 return ret;
2271
2272 /*
2273 * vmbus_allocate_mmio() gets used for allocating both device endpoint
2274 * resource claims (those which cannot be overlapped) and the ranges
2275 * which are valid for the children of this bus, which are intended
2276 * to be overlapped by those children. Set the flag on this claim
2277 * meaning that this region can't be overlapped.
2278 */
2279
2280 hbus->mem_config->flags |= IORESOURCE_BUSY;
2281
2282 return 0;
2283 }
2284
2285 static void hv_free_config_window(struct hv_pcibus_device *hbus)
2286 {
2287 vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH);
2288 }
2289
2290 /**
2291 * hv_pci_enter_d0() - Bring the "bus" into the D0 power state
2292 * @hdev: VMBus's tracking struct for this root PCI bus
2293 *
2294 * Return: 0 on success, -errno on failure
2295 */
2296 static int hv_pci_enter_d0(struct hv_device *hdev)
2297 {
2298 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2299 struct pci_bus_d0_entry *d0_entry;
2300 struct hv_pci_compl comp_pkt;
2301 struct pci_packet *pkt;
2302 int ret;
2303
2304 /*
2305 * Tell the host that the bus is ready to use, and moved into the
2306 * powered-on state. This includes telling the host which region
2307 * of memory-mapped I/O space has been chosen for configuration space
2308 * access.
2309 */
2310 pkt = kzalloc(sizeof(*pkt) + sizeof(*d0_entry), GFP_KERNEL);
2311 if (!pkt)
2312 return -ENOMEM;
2313
2314 init_completion(&comp_pkt.host_event);
2315 pkt->completion_func = hv_pci_generic_compl;
2316 pkt->compl_ctxt = &comp_pkt;
2317 d0_entry = (struct pci_bus_d0_entry *)&pkt->message;
2318 d0_entry->message_type.type = PCI_BUS_D0ENTRY;
2319 d0_entry->mmio_base = hbus->mem_config->start;
2320
2321 ret = vmbus_sendpacket(hdev->channel, d0_entry, sizeof(*d0_entry),
2322 (unsigned long)pkt, VM_PKT_DATA_INBAND,
2323 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2324 if (!ret)
2325 ret = wait_for_response(hdev, &comp_pkt.host_event);
2326
2327 if (ret)
2328 goto exit;
2329
2330 if (comp_pkt.completion_status < 0) {
2331 dev_err(&hdev->device,
2332 "PCI Pass-through VSP failed D0 Entry with status %x\n",
2333 comp_pkt.completion_status);
2334 ret = -EPROTO;
2335 goto exit;
2336 }
2337
2338 ret = 0;
2339
2340 exit:
2341 kfree(pkt);
2342 return ret;
2343 }
2344
2345 /**
2346 * hv_pci_query_relations() - Ask host to send list of child
2347 * devices
2348 * @hdev: VMBus's tracking struct for this root PCI bus
2349 *
2350 * Return: 0 on success, -errno on failure
2351 */
2352 static int hv_pci_query_relations(struct hv_device *hdev)
2353 {
2354 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2355 struct pci_message message;
2356 struct completion comp;
2357 int ret;
2358
2359 /* Ask the host to send along the list of child devices */
2360 init_completion(&comp);
2361 if (cmpxchg(&hbus->survey_event, NULL, &comp))
2362 return -ENOTEMPTY;
2363
2364 memset(&message, 0, sizeof(message));
2365 message.type = PCI_QUERY_BUS_RELATIONS;
2366
2367 ret = vmbus_sendpacket(hdev->channel, &message, sizeof(message),
2368 0, VM_PKT_DATA_INBAND, 0);
2369 if (!ret)
2370 ret = wait_for_response(hdev, &comp);
2371
2372 return ret;
2373 }
2374
2375 /**
2376 * hv_send_resources_allocated() - Report local resource choices
2377 * @hdev: VMBus's tracking struct for this root PCI bus
2378 *
2379 * The host OS is expecting to be sent a request as a message
2380 * which contains all the resources that the device will use.
2381 * The response contains those same resources, "translated"
2382 * which is to say, the values which should be used by the
2383 * hardware, when it delivers an interrupt. (MMIO resources are
2384 * used in local terms.) This is nice for Windows, and lines up
2385 * with the FDO/PDO split, which doesn't exist in Linux. Linux
2386 * is deeply expecting to scan an emulated PCI configuration
2387 * space. So this message is sent here only to drive the state
2388 * machine on the host forward.
2389 *
2390 * Return: 0 on success, -errno on failure
2391 */
2392 static int hv_send_resources_allocated(struct hv_device *hdev)
2393 {
2394 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2395 struct pci_resources_assigned *res_assigned;
2396 struct pci_resources_assigned2 *res_assigned2;
2397 struct hv_pci_compl comp_pkt;
2398 struct hv_pci_dev *hpdev;
2399 struct pci_packet *pkt;
2400 size_t size_res;
2401 u32 wslot;
2402 int ret;
2403
2404 size_res = (pci_protocol_version < PCI_PROTOCOL_VERSION_1_2)
2405 ? sizeof(*res_assigned) : sizeof(*res_assigned2);
2406
2407 pkt = kmalloc(sizeof(*pkt) + size_res, GFP_KERNEL);
2408 if (!pkt)
2409 return -ENOMEM;
2410
2411 ret = 0;
2412
2413 for (wslot = 0; wslot < 256; wslot++) {
2414 hpdev = get_pcichild_wslot(hbus, wslot);
2415 if (!hpdev)
2416 continue;
2417
2418 memset(pkt, 0, sizeof(*pkt) + size_res);
2419 init_completion(&comp_pkt.host_event);
2420 pkt->completion_func = hv_pci_generic_compl;
2421 pkt->compl_ctxt = &comp_pkt;
2422
2423 if (pci_protocol_version < PCI_PROTOCOL_VERSION_1_2) {
2424 res_assigned =
2425 (struct pci_resources_assigned *)&pkt->message;
2426 res_assigned->message_type.type =
2427 PCI_RESOURCES_ASSIGNED;
2428 res_assigned->wslot.slot = hpdev->desc.win_slot.slot;
2429 } else {
2430 res_assigned2 =
2431 (struct pci_resources_assigned2 *)&pkt->message;
2432 res_assigned2->message_type.type =
2433 PCI_RESOURCES_ASSIGNED2;
2434 res_assigned2->wslot.slot = hpdev->desc.win_slot.slot;
2435 }
2436 put_pcichild(hpdev, hv_pcidev_ref_by_slot);
2437
2438 ret = vmbus_sendpacket(hdev->channel, &pkt->message,
2439 size_res, (unsigned long)pkt,
2440 VM_PKT_DATA_INBAND,
2441 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2442 if (!ret)
2443 ret = wait_for_response(hdev, &comp_pkt.host_event);
2444 if (ret)
2445 break;
2446
2447 if (comp_pkt.completion_status < 0) {
2448 ret = -EPROTO;
2449 dev_err(&hdev->device,
2450 "resource allocated returned 0x%x",
2451 comp_pkt.completion_status);
2452 break;
2453 }
2454 }
2455
2456 kfree(pkt);
2457 return ret;
2458 }
2459
2460 /**
2461 * hv_send_resources_released() - Report local resources
2462 * released
2463 * @hdev: VMBus's tracking struct for this root PCI bus
2464 *
2465 * Return: 0 on success, -errno on failure
2466 */
2467 static int hv_send_resources_released(struct hv_device *hdev)
2468 {
2469 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2470 struct pci_child_message pkt;
2471 struct hv_pci_dev *hpdev;
2472 u32 wslot;
2473 int ret;
2474
2475 for (wslot = 0; wslot < 256; wslot++) {
2476 hpdev = get_pcichild_wslot(hbus, wslot);
2477 if (!hpdev)
2478 continue;
2479
2480 memset(&pkt, 0, sizeof(pkt));
2481 pkt.message_type.type = PCI_RESOURCES_RELEASED;
2482 pkt.wslot.slot = hpdev->desc.win_slot.slot;
2483
2484 put_pcichild(hpdev, hv_pcidev_ref_by_slot);
2485
2486 ret = vmbus_sendpacket(hdev->channel, &pkt, sizeof(pkt), 0,
2487 VM_PKT_DATA_INBAND, 0);
2488 if (ret)
2489 return ret;
2490 }
2491
2492 return 0;
2493 }
2494
2495 static void get_hvpcibus(struct hv_pcibus_device *hbus)
2496 {
2497 atomic_inc(&hbus->remove_lock);
2498 }
2499
2500 static void put_hvpcibus(struct hv_pcibus_device *hbus)
2501 {
2502 if (atomic_dec_and_test(&hbus->remove_lock))
2503 complete(&hbus->remove_event);
2504 }
2505
2506 /**
2507 * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
2508 * @hdev: VMBus's tracking struct for this root PCI bus
2509 * @dev_id: Identifies the device itself
2510 *
2511 * Return: 0 on success, -errno on failure
2512 */
2513 static int hv_pci_probe(struct hv_device *hdev,
2514 const struct hv_vmbus_device_id *dev_id)
2515 {
2516 struct hv_pcibus_device *hbus;
2517 int ret;
2518
2519 /*
2520 * hv_pcibus_device contains the hypercall arguments for retargeting in
2521 * hv_irq_unmask(). Those must not cross a page boundary.
2522 */
2523 BUILD_BUG_ON(sizeof(*hbus) > PAGE_SIZE);
2524
2525 hbus = (struct hv_pcibus_device *)get_zeroed_page(GFP_KERNEL);
2526 if (!hbus)
2527 return -ENOMEM;
2528 hbus->state = hv_pcibus_init;
2529
2530 /*
2531 * The PCI bus "domain" is what is called "segment" in ACPI and
2532 * other specs. Pull it from the instance ID, to get something
2533 * unique. Bytes 8 and 9 are what is used in Windows guests, so
2534 * do the same thing for consistency. Note that, since this code
2535 * only runs in a Hyper-V VM, Hyper-V can (and does) guarantee
2536 * that (1) the only domain in use for something that looks like
2537 * a physical PCI bus (which is actually emulated by the
2538 * hypervisor) is domain 0 and (2) there will be no overlap
2539 * between domains derived from these instance IDs in the same
2540 * VM.
2541 */
2542 hbus->sysdata.domain = hdev->dev_instance.b[9] |
2543 hdev->dev_instance.b[8] << 8;
2544
2545 hbus->hdev = hdev;
2546 atomic_inc(&hbus->remove_lock);
2547 INIT_LIST_HEAD(&hbus->children);
2548 INIT_LIST_HEAD(&hbus->dr_list);
2549 INIT_LIST_HEAD(&hbus->resources_for_children);
2550 spin_lock_init(&hbus->config_lock);
2551 spin_lock_init(&hbus->device_list_lock);
2552 spin_lock_init(&hbus->retarget_msi_interrupt_lock);
2553 init_completion(&hbus->remove_event);
2554 hbus->wq = alloc_ordered_workqueue("hv_pci_%x", 0,
2555 hbus->sysdata.domain);
2556 if (!hbus->wq) {
2557 ret = -ENOMEM;
2558 goto free_bus;
2559 }
2560
2561 ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
2562 hv_pci_onchannelcallback, hbus);
2563 if (ret)
2564 goto destroy_wq;
2565
2566 hv_set_drvdata(hdev, hbus);
2567
2568 ret = hv_pci_protocol_negotiation(hdev);
2569 if (ret)
2570 goto close;
2571
2572 ret = hv_allocate_config_window(hbus);
2573 if (ret)
2574 goto close;
2575
2576 hbus->cfg_addr = ioremap(hbus->mem_config->start,
2577 PCI_CONFIG_MMIO_LENGTH);
2578 if (!hbus->cfg_addr) {
2579 dev_err(&hdev->device,
2580 "Unable to map a virtual address for config space\n");
2581 ret = -ENOMEM;
2582 goto free_config;
2583 }
2584
2585 hbus->sysdata.fwnode = irq_domain_alloc_fwnode(hbus);
2586 if (!hbus->sysdata.fwnode) {
2587 ret = -ENOMEM;
2588 goto unmap;
2589 }
2590
2591 ret = hv_pcie_init_irq_domain(hbus);
2592 if (ret)
2593 goto free_fwnode;
2594
2595 ret = hv_pci_query_relations(hdev);
2596 if (ret)
2597 goto free_irq_domain;
2598
2599 ret = hv_pci_enter_d0(hdev);
2600 if (ret)
2601 goto free_irq_domain;
2602
2603 ret = hv_pci_allocate_bridge_windows(hbus);
2604 if (ret)
2605 goto free_irq_domain;
2606
2607 ret = hv_send_resources_allocated(hdev);
2608 if (ret)
2609 goto free_windows;
2610
2611 prepopulate_bars(hbus);
2612
2613 hbus->state = hv_pcibus_probed;
2614
2615 ret = create_root_hv_pci_bus(hbus);
2616 if (ret)
2617 goto free_windows;
2618
2619 return 0;
2620
2621 free_windows:
2622 hv_pci_free_bridge_windows(hbus);
2623 free_irq_domain:
2624 irq_domain_remove(hbus->irq_domain);
2625 free_fwnode:
2626 irq_domain_free_fwnode(hbus->sysdata.fwnode);
2627 unmap:
2628 iounmap(hbus->cfg_addr);
2629 free_config:
2630 hv_free_config_window(hbus);
2631 close:
2632 vmbus_close(hdev->channel);
2633 destroy_wq:
2634 destroy_workqueue(hbus->wq);
2635 free_bus:
2636 free_page((unsigned long)hbus);
2637 return ret;
2638 }
2639
2640 static void hv_pci_bus_exit(struct hv_device *hdev)
2641 {
2642 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2643 struct {
2644 struct pci_packet teardown_packet;
2645 u8 buffer[sizeof(struct pci_message)];
2646 } pkt;
2647 struct pci_bus_relations relations;
2648 struct hv_pci_compl comp_pkt;
2649 int ret;
2650
2651 /*
2652 * After the host sends the RESCIND_CHANNEL message, it doesn't
2653 * access the per-channel ringbuffer any longer.
2654 */
2655 if (hdev->channel->rescind)
2656 return;
2657
2658 /* Delete any children which might still exist. */
2659 memset(&relations, 0, sizeof(relations));
2660 hv_pci_devices_present(hbus, &relations);
2661
2662 ret = hv_send_resources_released(hdev);
2663 if (ret)
2664 dev_err(&hdev->device,
2665 "Couldn't send resources released packet(s)\n");
2666
2667 memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet));
2668 init_completion(&comp_pkt.host_event);
2669 pkt.teardown_packet.completion_func = hv_pci_generic_compl;
2670 pkt.teardown_packet.compl_ctxt = &comp_pkt;
2671 pkt.teardown_packet.message[0].type = PCI_BUS_D0EXIT;
2672
2673 ret = vmbus_sendpacket(hdev->channel, &pkt.teardown_packet.message,
2674 sizeof(struct pci_message),
2675 (unsigned long)&pkt.teardown_packet,
2676 VM_PKT_DATA_INBAND,
2677 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2678 if (!ret)
2679 wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ);
2680 }
2681
2682 /**
2683 * hv_pci_remove() - Remove routine for this VMBus channel
2684 * @hdev: VMBus's tracking struct for this root PCI bus
2685 *
2686 * Return: 0 on success, -errno on failure
2687 */
2688 static int hv_pci_remove(struct hv_device *hdev)
2689 {
2690 struct hv_pcibus_device *hbus;
2691
2692 hbus = hv_get_drvdata(hdev);
2693 if (hbus->state == hv_pcibus_installed) {
2694 /* Remove the bus from PCI's point of view. */
2695 pci_lock_rescan_remove();
2696 pci_stop_root_bus(hbus->pci_bus);
2697 pci_remove_root_bus(hbus->pci_bus);
2698 pci_unlock_rescan_remove();
2699 hbus->state = hv_pcibus_removed;
2700 }
2701
2702 hv_pci_bus_exit(hdev);
2703
2704 vmbus_close(hdev->channel);
2705
2706 iounmap(hbus->cfg_addr);
2707 hv_free_config_window(hbus);
2708 pci_free_resource_list(&hbus->resources_for_children);
2709 hv_pci_free_bridge_windows(hbus);
2710 irq_domain_remove(hbus->irq_domain);
2711 irq_domain_free_fwnode(hbus->sysdata.fwnode);
2712 put_hvpcibus(hbus);
2713 wait_for_completion(&hbus->remove_event);
2714 destroy_workqueue(hbus->wq);
2715 free_page((unsigned long)hbus);
2716 return 0;
2717 }
2718
2719 static const struct hv_vmbus_device_id hv_pci_id_table[] = {
2720 /* PCI Pass-through Class ID */
2721 /* 44C4F61D-4444-4400-9D52-802E27EDE19F */
2722 { HV_PCIE_GUID, },
2723 { },
2724 };
2725
2726 MODULE_DEVICE_TABLE(vmbus, hv_pci_id_table);
2727
2728 static struct hv_driver hv_pci_drv = {
2729 .name = "hv_pci",
2730 .id_table = hv_pci_id_table,
2731 .probe = hv_pci_probe,
2732 .remove = hv_pci_remove,
2733 };
2734
2735 static void __exit exit_hv_pci_drv(void)
2736 {
2737 vmbus_driver_unregister(&hv_pci_drv);
2738 }
2739
2740 static int __init init_hv_pci_drv(void)
2741 {
2742 return vmbus_driver_register(&hv_pci_drv);
2743 }
2744
2745 module_init(init_hv_pci_drv);
2746 module_exit(exit_hv_pci_drv);
2747
2748 MODULE_DESCRIPTION("Hyper-V PCI");
2749 MODULE_LICENSE("GPL v2");