/*
 * Copyright (c) Microsoft Corporation.
 *
 * Author:
 * Jake Oshins <jakeo@microsoft.com>
 *
 * This driver acts as a paravirtual front-end for PCI Express root buses.
 * When a PCI Express function (either an entire device or an SR-IOV
 * Virtual Function) is being passed through to the VM, this driver exposes
 * a new bus to the guest VM. This is modeled as a root PCI bus because
 * no bridges are being exposed to the VM. In fact, with a "Generation 2"
 * VM within Hyper-V, there may seem to be no PCI bus at all in the VM
 * until a device has been exposed using this driver.
 *
 * Each root PCI bus has its own PCI domain, which is called "Segment" in
 * the PCI Firmware Specifications. Thus while each device passed through
 * to the VM using this front-end will appear at "device 0", the domain will
 * be unique. Typically, each bus will have one PCI function on it, though
 * this driver does support more than one.
 *
 * In order to map the interrupts from the device through to the guest VM,
 * this driver also implements an IRQ Domain, which handles interrupts (either
 * MSI or MSI-X) associated with the functions on the bus. As interrupts are
 * set up, torn down, or reaffined, this driver communicates with the
 * underlying hypervisor to adjust the mappings in the I/O MMU so that each
 * interrupt will be delivered to the correct virtual processor at the right
 * vector. This driver does not support level-triggered (line-based)
 * interrupts, and will report that the Interrupt Line register in the
 * function's configuration space is zero.
 *
 * The rest of this driver mostly maps PCI concepts onto underlying Hyper-V
 * facilities. For instance, the configuration space of a function exposed
 * by Hyper-V is mapped into a single page of memory space, and the
 * read and write handlers for config space must be aware of this mechanism.
 * Similarly, device setup and teardown involves messages sent to and from
 * the PCI back-end driver in Hyper-V.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for more
 * details.
 *
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/delay.h>
#include <linux/semaphore.h>
#include <linux/irqdomain.h>
#include <asm/irqdomain.h>
#include <asm/apic.h>
#include <linux/irq.h>
#include <linux/msi.h>
#include <linux/hyperv.h>
#include <linux/refcount.h>
#include <asm/mshyperv.h>

/*
 * Protocol versions. The low word is the minor version, the high word the
 * major version.
 */

#define PCI_MAKE_VERSION(major, minor) ((u32)(((major) << 16) | (minor)))
#define PCI_MAJOR_VERSION(version) ((u32)(version) >> 16)
#define PCI_MINOR_VERSION(version) ((u32)(version) & 0xff)
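
/*
 * For example, PCI_MAKE_VERSION(1, 2) yields 0x00010002, so
 * PCI_MAJOR_VERSION(0x00010002) == 1 and PCI_MINOR_VERSION(0x00010002)
 * == 2. PCI_MINOR_VERSION masks only the low byte, which is enough for
 * every minor version this driver negotiates.
 */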

enum pci_protocol_version_t {
	PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1),	/* Win10 */
	PCI_PROTOCOL_VERSION_1_2 = PCI_MAKE_VERSION(1, 2),	/* RS1 */
};

#define CPU_AFFINITY_ALL	-1ULL

/*
 * Supported protocol versions in the order of probing - the highest goes
 * first.
 */
static enum pci_protocol_version_t pci_protocol_versions[] = {
	PCI_PROTOCOL_VERSION_1_2,
	PCI_PROTOCOL_VERSION_1_1,
};

/*
 * Protocol version negotiated by hv_pci_protocol_negotiation().
 */
static enum pci_protocol_version_t pci_protocol_version;

#define PCI_CONFIG_MMIO_LENGTH	0x2000
#define CFG_PAGE_OFFSET 0x1000
#define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET)

#define MAX_SUPPORTED_MSI_MESSAGES 0x400

#define STATUS_REVISION_MISMATCH 0xC0000059

/*
 * Message Types
 */

enum pci_message_type {
	/*
	 * Version 1.1
	 */
	PCI_MESSAGE_BASE = 0x42490000,
	PCI_BUS_RELATIONS = PCI_MESSAGE_BASE + 0,
	PCI_QUERY_BUS_RELATIONS = PCI_MESSAGE_BASE + 1,
	PCI_POWER_STATE_CHANGE = PCI_MESSAGE_BASE + 4,
	PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5,
	PCI_QUERY_RESOURCE_RESOURCES = PCI_MESSAGE_BASE + 6,
	PCI_BUS_D0ENTRY = PCI_MESSAGE_BASE + 7,
	PCI_BUS_D0EXIT = PCI_MESSAGE_BASE + 8,
	PCI_READ_BLOCK = PCI_MESSAGE_BASE + 9,
	PCI_WRITE_BLOCK = PCI_MESSAGE_BASE + 0xA,
	PCI_EJECT = PCI_MESSAGE_BASE + 0xB,
	PCI_QUERY_STOP = PCI_MESSAGE_BASE + 0xC,
	PCI_REENABLE = PCI_MESSAGE_BASE + 0xD,
	PCI_QUERY_STOP_FAILED = PCI_MESSAGE_BASE + 0xE,
	PCI_EJECTION_COMPLETE = PCI_MESSAGE_BASE + 0xF,
	PCI_RESOURCES_ASSIGNED = PCI_MESSAGE_BASE + 0x10,
	PCI_RESOURCES_RELEASED = PCI_MESSAGE_BASE + 0x11,
	PCI_INVALIDATE_BLOCK = PCI_MESSAGE_BASE + 0x12,
	PCI_QUERY_PROTOCOL_VERSION = PCI_MESSAGE_BASE + 0x13,
	PCI_CREATE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x14,
	PCI_DELETE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x15,
	PCI_RESOURCES_ASSIGNED2 = PCI_MESSAGE_BASE + 0x16,
	PCI_CREATE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x17,
	PCI_DELETE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x18, /* unused */
	PCI_MESSAGE_MAXIMUM
};

/*
 * Structures defining the virtual PCI Express protocol.
 */

union pci_version {
	struct {
		u16 minor_version;
		u16 major_version;
	} parts;
	u32 version;
} __packed;

/*
 * Function numbers are 8-bits wide on Express, as interpreted through ARI,
 * which is all this driver does. This representation is the one used in
 * Windows, which is what is expected when sending this back and forth with
 * the Hyper-V parent partition.
 */
union win_slot_encoding {
	struct {
		u32	dev:5;
		u32	func:3;
		u32	reserved:24;
	} bits;
	u32 slot;
} __packed;

/*
 * Pretty much as defined in the PCI Specifications.
 */
struct pci_function_description {
	u16	v_id;	/* vendor ID */
	u16	d_id;	/* device ID */
	u8	rev;
	u8	prog_intf;
	u8	subclass;
	u8	base_class;
	u32	subsystem_id;
	union win_slot_encoding win_slot;
	u32	ser;	/* serial number */
} __packed;

/**
 * struct hv_msi_desc
 * @vector:		IDT entry
 * @delivery_mode:	As defined in Intel's Programmer's
 *			Reference Manual, Volume 3, Chapter 8.
 * @vector_count:	Number of contiguous entries in the
 *			Interrupt Descriptor Table that are
 *			occupied by this Message-Signaled
 *			Interrupt. For "MSI", as first defined
 *			in PCI 2.2, this can be between 1 and
 *			32. For "MSI-X," as first defined in PCI
 *			3.0, this must be 1, as each MSI-X table
 *			entry would have its own descriptor.
 * @reserved:		Empty space
 * @cpu_mask:		All the target virtual processors.
 */
struct hv_msi_desc {
	u8	vector;
	u8	delivery_mode;
	u16	vector_count;
	u32	reserved;
	u64	cpu_mask;
} __packed;

/**
 * struct hv_msi_desc2 - 1.2 version of hv_msi_desc
 * @vector:		IDT entry
 * @delivery_mode:	As defined in Intel's Programmer's
 *			Reference Manual, Volume 3, Chapter 8.
 * @vector_count:	Number of contiguous entries in the
 *			Interrupt Descriptor Table that are
 *			occupied by this Message-Signaled
 *			Interrupt. For "MSI", as first defined
 *			in PCI 2.2, this can be between 1 and
 *			32. For "MSI-X," as first defined in PCI
 *			3.0, this must be 1, as each MSI-X table
 *			entry would have its own descriptor.
 * @processor_count:	number of bits enabled in array.
 * @processor_array:	All the target virtual processors.
 */
struct hv_msi_desc2 {
	u8	vector;
	u8	delivery_mode;
	u16	vector_count;
	u16	processor_count;
	u16	processor_array[32];
} __packed;

/**
 * struct tran_int_desc
 * @reserved:		unused, padding
 * @vector_count:	same as in hv_msi_desc
 * @data:		This is the "data payload" value that is
 *			written by the device when it generates
 *			a message-signaled interrupt, either MSI
 *			or MSI-X.
 * @address:		This is the address to which the data
 *			payload is written on interrupt
 *			generation.
 */
struct tran_int_desc {
	u16	reserved;
	u16	vector_count;
	u32	data;
	u64	address;
} __packed;

/*
 * A generic message format for virtual PCI.
 * Specific message formats are defined later in the file.
 */

struct pci_message {
	u32 type;
} __packed;

struct pci_child_message {
	struct pci_message message_type;
	union win_slot_encoding wslot;
} __packed;

struct pci_incoming_message {
	struct vmpacket_descriptor hdr;
	struct pci_message message_type;
} __packed;

struct pci_response {
	struct vmpacket_descriptor hdr;
	s32 status;	/* negative values are failures */
} __packed;

struct pci_packet {
	void (*completion_func)(void *context, struct pci_response *resp,
				int resp_packet_size);
	void *compl_ctxt;

	struct pci_message message[0];
};
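
/*
 * Note on usage (illustrative sketch of the pattern used later in this
 * file): message[0] is a zero-length trailing array, so a pci_packet is
 * always allocated together with the specific message that follows it,
 * typically as an on-stack wrapper such as:
 *
 *	struct {
 *		struct pci_packet init_packet;
 *		u8 buffer[sizeof(struct pci_child_message)];
 *	} pkt;
 *
 * with the message then built in place through pkt.init_packet.message.
 */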

/*
 * Specific message types supporting the PCI protocol.
 */

/*
 * Version negotiation message. Sent from the guest to the host.
 * The guest is free to try different versions until the host
 * accepts the version.
 *
 * pci_version: The protocol version requested.
 * is_last_attempt: If TRUE, this is the last version guest will request.
 * reservedz: Reserved field, set to zero.
 */

struct pci_version_request {
	struct pci_message message_type;
	u32 protocol_version;
} __packed;

/*
 * Bus D0 Entry. This is sent from the guest to the host when the virtual
 * bus (PCI Express port) is ready for action.
 */

struct pci_bus_d0_entry {
	struct pci_message message_type;
	u32 reserved;
	u64 mmio_base;
} __packed;

struct pci_bus_relations {
	struct pci_incoming_message incoming;
	u32 device_count;
	struct pci_function_description func[0];
} __packed;

struct pci_q_res_req_response {
	struct vmpacket_descriptor hdr;
	s32 status;	/* negative values are failures */
	u32 probed_bar[6];
} __packed;

struct pci_set_power {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	u32 power_state;	/* In Windows terms */
	u32 reserved;
} __packed;

struct pci_set_power_response {
	struct vmpacket_descriptor hdr;
	s32 status;	/* negative values are failures */
	union win_slot_encoding wslot;
	u32 resultant_state;	/* In Windows terms */
	u32 reserved;
} __packed;

struct pci_resources_assigned {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	u8 memory_range[0x14][6];	/* not used here */
	u32 msi_descriptors;
	u32 reserved[4];
} __packed;

struct pci_resources_assigned2 {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	u8 memory_range[0x14][6];	/* not used here */
	u32 msi_descriptor_count;
	u8 reserved[70];
} __packed;

struct pci_create_interrupt {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	struct hv_msi_desc int_desc;
} __packed;

struct pci_create_int_response {
	struct pci_response response;
	u32 reserved;
	struct tran_int_desc int_desc;
} __packed;

struct pci_create_interrupt2 {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	struct hv_msi_desc2 int_desc;
} __packed;

struct pci_delete_interrupt {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	struct tran_int_desc int_desc;
} __packed;

struct pci_dev_incoming {
	struct pci_incoming_message incoming;
	union win_slot_encoding wslot;
} __packed;

struct pci_eject_response {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	u32 status;
} __packed;

static int pci_ring_size = (4 * PAGE_SIZE);

/*
 * Definitions for the interrupt steering hypercall.
 */
#define HV_PARTITION_ID_SELF		((u64)-1)
#define HVCALL_RETARGET_INTERRUPT	0x7e

struct hv_interrupt_entry {
	u32	source;			/* 1 for MSI(-X) */
	u32	reserved1;
	u32	address;
	u32	data;
};

#define HV_VP_SET_BANK_COUNT_MAX	5 /* current implementation limit */

struct hv_vp_set {
	u64	format;			/* 0 (HvGenericSetSparse4k) */
	u64	valid_banks;
	u64	masks[HV_VP_SET_BANK_COUNT_MAX];
};
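
/*
 * For illustration: in the sparse-4k set format used below, virtual
 * processor number N is represented by bit (N % 64) of masks[N / 64],
 * so vCPU 70 would set bit 6 of masks[1]. valid_banks is a bitmap
 * declaring which masks[] banks the hypervisor should examine.
 */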

/*
 * flags for hv_device_interrupt_target.flags
 */
#define HV_DEVICE_INTERRUPT_TARGET_MULTICAST		1
#define HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET	2

struct hv_device_interrupt_target {
	u32	vector;
	u32	flags;
	union {
		u64		 vp_mask;
		struct hv_vp_set vp_set;
	};
};

struct retarget_msi_interrupt {
	u64	partition_id;		/* use "self" */
	u64	device_id;
	struct hv_interrupt_entry int_entry;
	u64	reserved2;
	struct hv_device_interrupt_target int_target;
} __packed;

/*
 * Driver specific state.
 */

enum hv_pcibus_state {
	hv_pcibus_init = 0,
	hv_pcibus_probed,
	hv_pcibus_installed,
	hv_pcibus_removed,
	hv_pcibus_maximum
};

struct hv_pcibus_device {
	struct pci_sysdata sysdata;
	enum hv_pcibus_state state;
	atomic_t remove_lock;
	struct hv_device *hdev;
	resource_size_t low_mmio_space;
	resource_size_t high_mmio_space;
	struct resource *mem_config;
	struct resource *low_mmio_res;
	struct resource *high_mmio_res;
	struct completion *survey_event;
	struct completion remove_event;
	struct pci_bus *pci_bus;
	spinlock_t config_lock;	/* Avoid two threads writing index page */
	spinlock_t device_list_lock;	/* Protect lists below */
	void __iomem *cfg_addr;

	struct list_head resources_for_children;

	struct list_head children;
	struct list_head dr_list;

	struct msi_domain_info msi_info;
	struct msi_controller msi_chip;
	struct irq_domain *irq_domain;

	/* hypercall arg, must not cross page boundary */
	struct retarget_msi_interrupt retarget_msi_interrupt_params;

	spinlock_t retarget_msi_interrupt_lock;

	struct workqueue_struct *wq;
};

/*
 * Tracks "Device Relations" messages from the host, which must be both
 * processed in order and deferred so that they don't run in the context
 * of the incoming packet callback.
 */
struct hv_dr_work {
	struct work_struct wrk;
	struct hv_pcibus_device *bus;
};

struct hv_dr_state {
	struct list_head list_entry;
	u32 device_count;
	struct pci_function_description func[0];
};

enum hv_pcichild_state {
	hv_pcichild_init = 0,
	hv_pcichild_requirements,
	hv_pcichild_resourced,
	hv_pcichild_ejecting,
	hv_pcichild_maximum
};

enum hv_pcidev_ref_reason {
	hv_pcidev_ref_invalid = 0,
	hv_pcidev_ref_initial,
	hv_pcidev_ref_by_slot,
	hv_pcidev_ref_packet,
	hv_pcidev_ref_pnp,
	hv_pcidev_ref_childlist,
	hv_pcidev_irqdata,
	hv_pcidev_ref_max
};

struct hv_pci_dev {
	/* List protected by pci_rescan_remove_lock */
	struct list_head list_entry;
	refcount_t refs;
	enum hv_pcichild_state state;
	struct pci_function_description desc;
	bool reported_missing;
	struct hv_pcibus_device *hbus;
	struct work_struct wrk;

	/*
	 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
	 * read it back, for each of the BAR offsets within config space.
	 */
	u32 probed_bar[6];
};

struct hv_pci_compl {
	struct completion host_event;
	s32 completion_status;
};

static void hv_pci_onchannelcallback(void *context);

/**
 * hv_pci_generic_compl() - Invoked for a completion packet
 * @context:		Set up by the sender of the packet.
 * @resp:		The response packet
 * @resp_packet_size:	Size in bytes of the packet
 *
 * This function is used to trigger an event and report status
 * for any message for which the completion packet contains a
 * status and nothing else.
 */
static void hv_pci_generic_compl(void *context, struct pci_response *resp,
				 int resp_packet_size)
{
	struct hv_pci_compl *comp_pkt = context;

	if (resp_packet_size >= offsetofend(struct pci_response, status))
		comp_pkt->completion_status = resp->status;
	else
		comp_pkt->completion_status = -1;

	complete(&comp_pkt->host_event);
}

static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
					     u32 wslot);
static void get_pcichild(struct hv_pci_dev *hv_pcidev,
			 enum hv_pcidev_ref_reason reason);
static void put_pcichild(struct hv_pci_dev *hv_pcidev,
			 enum hv_pcidev_ref_reason reason);

static void get_hvpcibus(struct hv_pcibus_device *hv_pcibus);
static void put_hvpcibus(struct hv_pcibus_device *hv_pcibus);

/*
 * There is no good way to get notified from vmbus_onoffer_rescind(),
 * so let's use polling here, since this is not a hot path.
 */
static int wait_for_response(struct hv_device *hdev,
			     struct completion *comp)
{
	while (true) {
		if (hdev->channel->rescind) {
			dev_warn_once(&hdev->device, "The device is gone.\n");
			return -ENODEV;
		}

		if (wait_for_completion_timeout(comp, HZ / 10))
			break;
	}

	return 0;
}

/**
 * devfn_to_wslot() - Convert from Linux PCI slot to Windows
 * @devfn: The Linux representation of PCI slot
 *
 * Windows uses a slightly different representation of PCI slot.
 *
 * Return: The Windows representation
 */
static u32 devfn_to_wslot(int devfn)
{
	union win_slot_encoding wslot;

	wslot.slot = 0;
	wslot.bits.dev = PCI_SLOT(devfn);
	wslot.bits.func = PCI_FUNC(devfn);

	return wslot.slot;
}

/**
 * wslot_to_devfn() - Convert from Windows PCI slot to Linux
 * @wslot: The Windows representation of PCI slot
 *
 * Windows uses a slightly different representation of PCI slot.
 *
 * Return: The Linux representation
 */
static int wslot_to_devfn(u32 wslot)
{
	union win_slot_encoding slot_no;

	slot_no.slot = wslot;
	return PCI_DEVFN(slot_no.bits.dev, slot_no.bits.func);
}
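
/*
 * For illustration: Linux packs the function number into the low 3 bits
 * of devfn with the device number above it, while win_slot_encoding
 * keeps the device in bits 0-4 and the function in bits 5-7. So for
 * device 2, function 1: devfn = PCI_DEVFN(2, 1) = 0x11, and
 * devfn_to_wslot(0x11) == 0x22, while wslot_to_devfn(0x22) == 0x11.
 */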

/*
 * PCI Configuration Space for these root PCI buses is implemented as a pair
 * of pages in memory-mapped I/O space. Writing to the first page chooses
 * the PCI function being written or read. Once the first page has been
 * written to, the following page maps in the entire configuration space of
 * the function.
 */

/**
 * _hv_pcifront_read_config() - Internal PCI config read
 * @hpdev:	The PCI driver's representation of the device
 * @where:	Offset within config space
 * @size:	Size of the transfer
 * @val:	Pointer to the buffer receiving the data
 */
static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
				     int size, u32 *val)
{
	unsigned long flags;
	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;

	/*
	 * If the attempt is to read the IDs or the ROM BAR, simulate that.
	 */
	if (where + size <= PCI_COMMAND) {
		memcpy(val, ((u8 *)&hpdev->desc.v_id) + where, size);
	} else if (where >= PCI_CLASS_REVISION && where + size <=
		   PCI_CACHE_LINE_SIZE) {
		memcpy(val, ((u8 *)&hpdev->desc.rev) + where -
		       PCI_CLASS_REVISION, size);
	} else if (where >= PCI_SUBSYSTEM_VENDOR_ID && where + size <=
		   PCI_ROM_ADDRESS) {
		memcpy(val, (u8 *)&hpdev->desc.subsystem_id + where -
		       PCI_SUBSYSTEM_VENDOR_ID, size);
	} else if (where >= PCI_ROM_ADDRESS && where + size <=
		   PCI_CAPABILITY_LIST) {
		/* ROM BARs are unimplemented */
		*val = 0;
	} else if (where >= PCI_INTERRUPT_LINE && where + size <=
		   PCI_INTERRUPT_PIN) {
		/*
		 * Interrupt Line and Interrupt PIN are hard-wired to zero
		 * because this front-end only supports message-signaled
		 * interrupts.
		 */
		*val = 0;
	} else if (where + size <= CFG_PAGE_SIZE) {
		spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
		/* Choose the function to be read. (See comment above) */
		writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
		/* Make sure the function was chosen before we start reading. */
		mb();
		/* Read from that function's config space. */
		switch (size) {
		case 1:
			*val = readb(addr);
			break;
		case 2:
			*val = readw(addr);
			break;
		default:
			*val = readl(addr);
			break;
		}
		/*
		 * Make sure the read was done before we release the spinlock
		 * allowing consecutive reads/writes.
		 */
		mb();
		spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
	} else {
		dev_err(&hpdev->hbus->hdev->device,
			"Attempt to read beyond a function's config space.\n");
	}
}

static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev)
{
	u16 ret;
	unsigned long flags;
	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET +
			     PCI_VENDOR_ID;

	spin_lock_irqsave(&hpdev->hbus->config_lock, flags);

	/* Choose the function to be read. (See comment above) */
	writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
	/* Make sure the function was chosen before we start reading. */
	mb();
	/* Read from that function's config space. */
	ret = readw(addr);
	/*
	 * mb() is not required here, because the spin_unlock_irqrestore()
	 * is a barrier.
	 */

	spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);

	return ret;
}

/**
 * _hv_pcifront_write_config() - Internal PCI config write
 * @hpdev:	The PCI driver's representation of the device
 * @where:	Offset within config space
 * @size:	Size of the transfer
 * @val:	The data being transferred
 */
static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where,
				      int size, u32 val)
{
	unsigned long flags;
	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;

	if (where >= PCI_SUBSYSTEM_VENDOR_ID &&
	    where + size <= PCI_CAPABILITY_LIST) {
		/* SSIDs and ROM BARs are read-only */
	} else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) {
		spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
		/* Choose the function to be written. (See comment above) */
		writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
		/* Make sure the function was chosen before we start writing. */
		wmb();
		/* Write to that function's config space. */
		switch (size) {
		case 1:
			writeb(val, addr);
			break;
		case 2:
			writew(val, addr);
			break;
		default:
			writel(val, addr);
			break;
		}
		/*
		 * Make sure the write was done before we release the spinlock
		 * allowing consecutive reads/writes.
		 */
		mb();
		spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
	} else {
		dev_err(&hpdev->hbus->hdev->device,
			"Attempt to write beyond a function's config space.\n");
	}
}

/**
 * hv_pcifront_read_config() - Read configuration space
 * @bus: PCI Bus structure
 * @devfn: Device/function
 * @where: Offset from base
 * @size: Byte/word/dword
 * @val: Value to be read
 *
 * Return: PCIBIOS_SUCCESSFUL on success
 *	   PCIBIOS_DEVICE_NOT_FOUND on failure
 */
static int hv_pcifront_read_config(struct pci_bus *bus, unsigned int devfn,
				   int where, int size, u32 *val)
{
	struct hv_pcibus_device *hbus =
		container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
	struct hv_pci_dev *hpdev;

	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
	if (!hpdev)
		return PCIBIOS_DEVICE_NOT_FOUND;

	_hv_pcifront_read_config(hpdev, where, size, val);

	put_pcichild(hpdev, hv_pcidev_ref_by_slot);
	return PCIBIOS_SUCCESSFUL;
}

/**
 * hv_pcifront_write_config() - Write configuration space
 * @bus: PCI Bus structure
 * @devfn: Device/function
 * @where: Offset from base
 * @size: Byte/word/dword
 * @val: Value to be written to device
 *
 * Return: PCIBIOS_SUCCESSFUL on success
 *	   PCIBIOS_DEVICE_NOT_FOUND on failure
 */
static int hv_pcifront_write_config(struct pci_bus *bus, unsigned int devfn,
				    int where, int size, u32 val)
{
	struct hv_pcibus_device *hbus =
		container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
	struct hv_pci_dev *hpdev;

	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
	if (!hpdev)
		return PCIBIOS_DEVICE_NOT_FOUND;

	_hv_pcifront_write_config(hpdev, where, size, val);

	put_pcichild(hpdev, hv_pcidev_ref_by_slot);
	return PCIBIOS_SUCCESSFUL;
}

/* PCIe operations */
static struct pci_ops hv_pcifront_ops = {
	.read  = hv_pcifront_read_config,
	.write = hv_pcifront_write_config,
};

/* Interrupt management hooks */
static void hv_int_desc_free(struct hv_pci_dev *hpdev,
			     struct tran_int_desc *int_desc)
{
	struct pci_delete_interrupt *int_pkt;
	struct {
		struct pci_packet pkt;
		u8 buffer[sizeof(struct pci_delete_interrupt)];
	} ctxt;

	memset(&ctxt, 0, sizeof(ctxt));
	int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
	int_pkt->message_type.type =
		PCI_DELETE_INTERRUPT_MESSAGE;
	int_pkt->wslot.slot = hpdev->desc.win_slot.slot;
	int_pkt->int_desc = *int_desc;
	vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, sizeof(*int_pkt),
			 (unsigned long)&ctxt.pkt, VM_PKT_DATA_INBAND, 0);
	kfree(int_desc);
}

/**
 * hv_msi_free() - Free the MSI.
 * @domain:	The interrupt domain pointer
 * @info:	Extra MSI-related context
 * @irq:	Identifies the IRQ.
 *
 * The Hyper-V parent partition and hypervisor are tracking the
 * messages that are in use, keeping the interrupt redirection
 * table up to date. This callback sends a message that frees
 * the IRT entry and related tracking nonsense.
 */
static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info,
			unsigned int irq)
{
	struct hv_pcibus_device *hbus;
	struct hv_pci_dev *hpdev;
	struct pci_dev *pdev;
	struct tran_int_desc *int_desc;
	struct irq_data *irq_data = irq_domain_get_irq_data(domain, irq);
	struct msi_desc *msi = irq_data_get_msi_desc(irq_data);

	pdev = msi_desc_to_pci_dev(msi);
	hbus = info->data;
	int_desc = irq_data_get_irq_chip_data(irq_data);
	if (!int_desc)
		return;

	irq_data->chip_data = NULL;
	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
	if (!hpdev) {
		kfree(int_desc);
		return;
	}

	hv_int_desc_free(hpdev, int_desc);
	put_pcichild(hpdev, hv_pcidev_ref_by_slot);
}

static int hv_set_affinity(struct irq_data *data, const struct cpumask *dest,
			   bool force)
{
	struct irq_data *parent = data->parent_data;

	return parent->chip->irq_set_affinity(parent, dest, force);
}

static void hv_irq_mask(struct irq_data *data)
{
	pci_msi_mask_irq(data);
}

/**
 * hv_irq_unmask() - "Unmask" the IRQ by setting its current
 * affinity.
 * @data:	Describes the IRQ
 *
 * Build a new destination for the MSI and make a hypercall to
 * update the Interrupt Redirection Table. "Device Logical ID"
 * is built out of this PCI bus's instance GUID and the function
 * number of the device.
 */
static void hv_irq_unmask(struct irq_data *data)
{
	struct msi_desc *msi_desc = irq_data_get_msi_desc(data);
	struct irq_cfg *cfg = irqd_cfg(data);
	struct retarget_msi_interrupt *params;
	struct hv_pcibus_device *hbus;
	struct cpumask *dest;
	struct pci_bus *pbus;
	struct pci_dev *pdev;
	unsigned long flags;
	u32 var_size = 0;
	int cpu_vmbus;
	int cpu;
	u64 res;

	dest = irq_data_get_effective_affinity_mask(data);
	pdev = msi_desc_to_pci_dev(msi_desc);
	pbus = pdev->bus;
	hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);

	spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags);

	params = &hbus->retarget_msi_interrupt_params;
	memset(params, 0, sizeof(*params));
	params->partition_id = HV_PARTITION_ID_SELF;
	params->int_entry.source = 1; /* MSI(-X) */
	params->int_entry.address = msi_desc->msg.address_lo;
	params->int_entry.data = msi_desc->msg.data;
	params->device_id = (hbus->hdev->dev_instance.b[5] << 24) |
			   (hbus->hdev->dev_instance.b[4] << 16) |
			   (hbus->hdev->dev_instance.b[7] << 8) |
			   (hbus->hdev->dev_instance.b[6] & 0xf8) |
			   PCI_FUNC(pdev->devfn);
	params->int_target.vector = cfg->vector;

	/*
	 * Honoring apic->irq_delivery_mode set to dest_Fixed by
	 * setting the HV_DEVICE_INTERRUPT_TARGET_MULTICAST flag results in a
	 * spurious interrupt storm. Not doing so does not seem to have a
	 * negative effect (yet?).
	 */

	if (pci_protocol_version >= PCI_PROTOCOL_VERSION_1_2) {
		/*
		 * PCI_PROTOCOL_VERSION_1_2 supports the VP_SET version of the
		 * HVCALL_RETARGET_INTERRUPT hypercall, which also coincides
		 * with >64 VP support.
		 * ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED
		 * is not sufficient for this hypercall.
		 */
		params->int_target.flags |=
			HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
		params->int_target.vp_set.valid_banks =
			(1ull << HV_VP_SET_BANK_COUNT_MAX) - 1;

		/*
		 * var-sized hypercall, var-size starts after vp_mask (thus
		 * vp_set.format does not count, but vp_set.valid_banks does).
		 */
		var_size = 1 + HV_VP_SET_BANK_COUNT_MAX;

		for_each_cpu_and(cpu, dest, cpu_online_mask) {
			cpu_vmbus = hv_cpu_number_to_vp_number(cpu);

			if (cpu_vmbus >= HV_VP_SET_BANK_COUNT_MAX * 64) {
				dev_err(&hbus->hdev->device,
					"too high CPU %d", cpu_vmbus);
				res = 1;
				goto exit_unlock;
			}

			params->int_target.vp_set.masks[cpu_vmbus / 64] |=
				(1ULL << (cpu_vmbus & 63));
		}
	} else {
		for_each_cpu_and(cpu, dest, cpu_online_mask) {
			params->int_target.vp_mask |=
				(1ULL << hv_cpu_number_to_vp_number(cpu));
		}
	}

	res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17),
			      params, NULL);

exit_unlock:
	spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags);

	if (res) {
		dev_err(&hbus->hdev->device,
			"%s() failed: %#llx", __func__, res);
		return;
	}

	pci_msi_unmask_irq(data);
}

struct compose_comp_ctxt {
	struct hv_pci_compl comp_pkt;
	struct tran_int_desc int_desc;
};

static void hv_pci_compose_compl(void *context, struct pci_response *resp,
				 int resp_packet_size)
{
	struct compose_comp_ctxt *comp_pkt = context;
	struct pci_create_int_response *int_resp =
		(struct pci_create_int_response *)resp;

	comp_pkt->comp_pkt.completion_status = resp->status;
	comp_pkt->int_desc = int_resp->int_desc;
	complete(&comp_pkt->comp_pkt.host_event);
}

static u32 hv_compose_msi_req_v1(
	struct pci_create_interrupt *int_pkt, struct cpumask *affinity,
	u32 slot, u8 vector)
{
	int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
	int_pkt->wslot.slot = slot;
	int_pkt->int_desc.vector = vector;
	int_pkt->int_desc.vector_count = 1;
	int_pkt->int_desc.delivery_mode = dest_Fixed;

	/*
	 * Create MSI w/ dummy vCPU set, overwritten by subsequent retarget in
	 * hv_irq_unmask().
	 */
	int_pkt->int_desc.cpu_mask = CPU_AFFINITY_ALL;

	return sizeof(*int_pkt);
}

static u32 hv_compose_msi_req_v2(
	struct pci_create_interrupt2 *int_pkt, struct cpumask *affinity,
	u32 slot, u8 vector)
{
	int cpu;

	int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2;
	int_pkt->wslot.slot = slot;
	int_pkt->int_desc.vector = vector;
	int_pkt->int_desc.vector_count = 1;
	int_pkt->int_desc.delivery_mode = dest_Fixed;

	/*
	 * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten
	 * by subsequent retarget in hv_irq_unmask().
	 */
	cpu = cpumask_first_and(affinity, cpu_online_mask);
	int_pkt->int_desc.processor_array[0] =
		hv_cpu_number_to_vp_number(cpu);
	int_pkt->int_desc.processor_count = 1;

	return sizeof(*int_pkt);
}

/**
 * hv_compose_msi_msg() - Supplies a valid MSI address/data
 * @data:	Everything about this MSI
 * @msg:	Buffer that is filled in by this function
 *
 * This function unpacks the IRQ looking for target CPU set, IDT
 * vector and mode and sends a message to the parent partition
 * asking for a mapping for that tuple in this partition. The
 * response supplies a data value and address to which that data
 * should be written to trigger that interrupt.
 */
static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
{
	struct irq_cfg *cfg = irqd_cfg(data);
	struct hv_pcibus_device *hbus;
	struct hv_pci_dev *hpdev;
	struct pci_bus *pbus;
	struct pci_dev *pdev;
	struct cpumask *dest;
	struct compose_comp_ctxt comp;
	struct tran_int_desc *int_desc;
	struct {
		struct pci_packet pci_pkt;
		union {
			struct pci_create_interrupt v1;
			struct pci_create_interrupt2 v2;
		} int_pkts;
	} __packed ctxt;

	u32 size;
	int ret;

	pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data));
	dest = irq_data_get_effective_affinity_mask(data);
	pbus = pdev->bus;
	hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
	if (!hpdev)
		goto return_null_message;

	/* Free any previous message that might have already been composed. */
	if (data->chip_data) {
		int_desc = data->chip_data;
		data->chip_data = NULL;
		hv_int_desc_free(hpdev, int_desc);
	}

	int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC);
	if (!int_desc)
		goto drop_reference;

	memset(&ctxt, 0, sizeof(ctxt));
	init_completion(&comp.comp_pkt.host_event);
	ctxt.pci_pkt.completion_func = hv_pci_compose_compl;
	ctxt.pci_pkt.compl_ctxt = &comp;

	switch (pci_protocol_version) {
	case PCI_PROTOCOL_VERSION_1_1:
		size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1,
					     dest,
					     hpdev->desc.win_slot.slot,
					     cfg->vector);
		break;

	case PCI_PROTOCOL_VERSION_1_2:
		size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2,
					     dest,
					     hpdev->desc.win_slot.slot,
					     cfg->vector);
		break;

	default:
		/* As we only negotiate protocol versions known to this driver,
		 * this path should never be hit. However, this is not a hot
		 * path, so we print a message to aid future updates.
		 */
		dev_err(&hbus->hdev->device,
			"Unexpected vPCI protocol, update driver.");
		goto free_int_desc;
	}

	ret = vmbus_sendpacket(hpdev->hbus->hdev->channel, &ctxt.int_pkts,
			       size, (unsigned long)&ctxt.pci_pkt,
			       VM_PKT_DATA_INBAND,
			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
	if (ret) {
		dev_err(&hbus->hdev->device,
			"Sending request for interrupt failed: 0x%x",
			comp.comp_pkt.completion_status);
		goto free_int_desc;
	}

	/*
	 * Since this function is called with IRQ locks held, can't
	 * do normal wait for completion; instead poll.
	 */
	while (!try_wait_for_completion(&comp.comp_pkt.host_event)) {
		/* 0xFFFF means an invalid PCI VENDOR ID. */
		if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) {
			dev_err_once(&hbus->hdev->device,
				     "the device has gone\n");
			goto free_int_desc;
		}

		/*
		 * When the higher level interrupt code calls us with
		 * interrupt disabled, we must poll the channel by calling
		 * the channel callback directly when channel->target_cpu is
		 * the current CPU. When the higher level interrupt code
		 * calls us with interrupt enabled, let's add the
		 * local_bh_disable()/enable() to avoid race.
		 */
		local_bh_disable();

		if (hbus->hdev->channel->target_cpu == smp_processor_id())
			hv_pci_onchannelcallback(hbus);

		local_bh_enable();

		if (hpdev->state == hv_pcichild_ejecting) {
			dev_err_once(&hbus->hdev->device,
				     "the device is being ejected\n");
			goto free_int_desc;
		}

		udelay(100);
	}

	if (comp.comp_pkt.completion_status < 0) {
		dev_err(&hbus->hdev->device,
			"Request for interrupt failed: 0x%x",
			comp.comp_pkt.completion_status);
		goto free_int_desc;
	}

	/*
	 * Record the assignment so that this can be unwound later. Using
	 * irq_set_chip_data() here would be appropriate, but the lock it takes
	 * is already held.
	 */
	*int_desc = comp.int_desc;
	data->chip_data = int_desc;

	/* Pass up the result. */
	msg->address_hi = comp.int_desc.address >> 32;
	msg->address_lo = comp.int_desc.address & 0xffffffff;
	msg->data = comp.int_desc.data;

	put_pcichild(hpdev, hv_pcidev_ref_by_slot);
	return;

free_int_desc:
	kfree(int_desc);
drop_reference:
	put_pcichild(hpdev, hv_pcidev_ref_by_slot);
return_null_message:
	msg->address_hi = 0;
	msg->address_lo = 0;
	msg->data = 0;
}

/* HW Interrupt Chip Descriptor */
static struct irq_chip hv_msi_irq_chip = {
	.name			= "Hyper-V PCIe MSI",
	.irq_compose_msi_msg	= hv_compose_msi_msg,
	.irq_set_affinity	= hv_set_affinity,
	.irq_ack		= irq_chip_ack_parent,
	.irq_mask		= hv_irq_mask,
	.irq_unmask		= hv_irq_unmask,
};

static irq_hw_number_t hv_msi_domain_ops_get_hwirq(struct msi_domain_info *info,
						   msi_alloc_info_t *arg)
{
	return arg->msi_hwirq;
}

static struct msi_domain_ops hv_msi_ops = {
	.get_hwirq	= hv_msi_domain_ops_get_hwirq,
	.msi_prepare	= pci_msi_prepare,
	.set_desc	= pci_msi_set_desc,
	.msi_free	= hv_msi_free,
};

/**
 * hv_pcie_init_irq_domain() - Initialize IRQ domain
 * @hbus:	The root PCI bus
 *
 * This function creates an IRQ domain which will be used for
 * interrupts from devices that have been passed through. These
 * devices only support MSI and MSI-X, not line-based interrupts
 * or simulations of line-based interrupts through PCIe's
 * fabric-layer messages. Because interrupts are remapped, we
 * can support multi-message MSI here.
 *
 * Return: '0' on success and error value on failure
 */
static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus)
{
	hbus->msi_info.chip = &hv_msi_irq_chip;
	hbus->msi_info.ops = &hv_msi_ops;
	hbus->msi_info.flags = (MSI_FLAG_USE_DEF_DOM_OPS |
		MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI |
		MSI_FLAG_PCI_MSIX);
	hbus->msi_info.handler = handle_edge_irq;
	hbus->msi_info.handler_name = "edge";
	hbus->msi_info.data = hbus;
	hbus->irq_domain = pci_msi_create_irq_domain(hbus->sysdata.fwnode,
						     &hbus->msi_info,
						     x86_vector_domain);
	if (!hbus->irq_domain) {
		dev_err(&hbus->hdev->device,
			"Failed to build an MSI IRQ domain\n");
		return -ENODEV;
	}

	return 0;
}

/**
 * get_bar_size() - Get the address space consumed by a BAR
 * @bar_val:	Value that a BAR returned after -1 was written
 *		to it.
 *
 * This function returns the size of the BAR, rounded up to 1
 * page. It has to be rounded up because the hypervisor's page
 * table entry that maps the BAR into the VM can't specify an
 * offset within a page. The invariant is that the hypervisor
 * must place any BARs of smaller than page length at the
 * beginning of a page.
 *
 * Return:	Size in bytes of the consumed MMIO space.
 */
static u64 get_bar_size(u64 bar_val)
{
	return round_up((1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK)),
			PAGE_SIZE);
}
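
/*
 * For illustration: a 32-bit BAR that reads back as 0xFFFFC000 after
 * probing is widened by the callers below to 0xFFFFFFFFFFFFC000;
 * masking off the low flag bits and computing 1 + ~val yields 0x4000
 * (16 KiB), which is already page-aligned, so round_up() leaves it
 * unchanged.
 */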

/**
 * survey_child_resources() - Total all MMIO requirements
 * @hbus:	Root PCI bus, as understood by this driver
 */
static void survey_child_resources(struct hv_pcibus_device *hbus)
{
	struct list_head *iter;
	struct hv_pci_dev *hpdev;
	resource_size_t bar_size = 0;
	unsigned long flags;
	struct completion *event;
	u64 bar_val;
	int i;

	/* If nobody is waiting on the answer, don't compute it. */
	event = xchg(&hbus->survey_event, NULL);
	if (!event)
		return;

	/* If the answer has already been computed, go with it. */
	if (hbus->low_mmio_space || hbus->high_mmio_space) {
		complete(event);
		return;
	}

	spin_lock_irqsave(&hbus->device_list_lock, flags);

	/*
	 * Due to an interesting quirk of the PCI spec, all memory regions
	 * for a child device are a power of 2 in size and aligned in memory,
	 * so it's sufficient to just add them up without tracking alignment.
	 */
	list_for_each(iter, &hbus->children) {
		hpdev = container_of(iter, struct hv_pci_dev, list_entry);
		for (i = 0; i < 6; i++) {
			if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO)
				dev_err(&hbus->hdev->device,
					"There's an I/O BAR in this list!\n");

			if (hpdev->probed_bar[i] != 0) {
				/*
				 * A probed BAR has all the upper bits set that
				 * can be changed.
				 */

				bar_val = hpdev->probed_bar[i];
				if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
					bar_val |=
					((u64)hpdev->probed_bar[++i] << 32);
				else
					bar_val |= 0xffffffff00000000ULL;

				bar_size = get_bar_size(bar_val);

				if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
					hbus->high_mmio_space += bar_size;
				else
					hbus->low_mmio_space += bar_size;
			}
		}
	}

	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
	complete(event);
}

/**
 * prepopulate_bars() - Fill in BARs with defaults
 * @hbus:	Root PCI bus, as understood by this driver
 *
 * The core PCI driver code seems much, much happier if the BARs
 * for a device have values upon first scan. So fill them in.
 * The algorithm below works down from large sizes to small,
 * attempting to pack the assignments optimally. The assumption,
 * enforced in other parts of the code, is that the beginning of
 * the memory-mapped I/O space will be aligned on the largest
 * BAR size.
 */
static void prepopulate_bars(struct hv_pcibus_device *hbus)
{
	resource_size_t high_size = 0;
	resource_size_t low_size = 0;
	resource_size_t high_base = 0;
	resource_size_t low_base = 0;
	resource_size_t bar_size;
	struct hv_pci_dev *hpdev;
	struct list_head *iter;
	unsigned long flags;
	u64 bar_val;
	u32 command;
	bool high;
	int i;

	if (hbus->low_mmio_space) {
		low_size = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
		low_base = hbus->low_mmio_res->start;
	}

	if (hbus->high_mmio_space) {
		high_size = 1ULL <<
			(63 - __builtin_clzll(hbus->high_mmio_space));
		high_base = hbus->high_mmio_res->start;
	}

	spin_lock_irqsave(&hbus->device_list_lock, flags);

	/* Pick addresses for the BARs. */
	do {
		list_for_each(iter, &hbus->children) {
			hpdev = container_of(iter, struct hv_pci_dev,
					     list_entry);
			for (i = 0; i < 6; i++) {
				bar_val = hpdev->probed_bar[i];
				if (bar_val == 0)
					continue;
				high = bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64;
				if (high) {
					bar_val |=
						((u64)hpdev->probed_bar[i + 1]
						 << 32);
				} else {
					bar_val |= 0xffffffffULL << 32;
				}
				bar_size = get_bar_size(bar_val);
				if (high) {
					if (high_size != bar_size) {
						i++;
						continue;
					}
					_hv_pcifront_write_config(hpdev,
						PCI_BASE_ADDRESS_0 + (4 * i),
						4,
						(u32)(high_base & 0xffffff00));
					i++;
					_hv_pcifront_write_config(hpdev,
						PCI_BASE_ADDRESS_0 + (4 * i),
						4, (u32)(high_base >> 32));
					high_base += bar_size;
				} else {
					if (low_size != bar_size)
						continue;
					_hv_pcifront_write_config(hpdev,
						PCI_BASE_ADDRESS_0 + (4 * i),
						4,
						(u32)(low_base & 0xffffff00));
					low_base += bar_size;
				}
			}
			if (high_size <= 1 && low_size <= 1) {
				/* Set the memory enable bit. */
				_hv_pcifront_read_config(hpdev, PCI_COMMAND, 2,
							 &command);
				command |= PCI_COMMAND_MEMORY;
				_hv_pcifront_write_config(hpdev, PCI_COMMAND, 2,
							  command);
				break;
			}
		}

		high_size >>= 1;
		low_size >>= 1;
	} while (high_size || low_size);

	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
}
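
/*
 * For illustration: with one 64 KiB BAR and two 4 KiB BARs in low MMIO
 * space (72 KiB total), low_size starts at 64 KiB, the largest power of
 * two not exceeding the total. The 64 KiB BAR is placed on the first
 * pass; low_size then halves on each pass until it reaches 4 KiB, when
 * the two small BARs are packed immediately after the large one, which
 * preserves each BAR's natural alignment.
 */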
1476
1477/**
1478 * create_root_hv_pci_bus() - Expose a new root PCI bus
1479 * @hbus: Root PCI bus, as understood by this driver
1480 *
1481 * Return: 0 on success, -errno on failure
1482 */
1483static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus)
1484{
1485 /* Register the device */
1486 hbus->pci_bus = pci_create_root_bus(&hbus->hdev->device,
1487 0, /* bus number is always zero */
1488 &hv_pcifront_ops,
1489 &hbus->sysdata,
1490 &hbus->resources_for_children);
1491 if (!hbus->pci_bus)
1492 return -ENODEV;
1493
1494 hbus->pci_bus->msi = &hbus->msi_chip;
1495 hbus->pci_bus->msi->dev = &hbus->hdev->device;
1496
414428c5 1497 pci_lock_rescan_remove();
4daace0d
JO
1498 pci_scan_child_bus(hbus->pci_bus);
1499 pci_bus_assign_resources(hbus->pci_bus);
1500 pci_bus_add_devices(hbus->pci_bus);
414428c5 1501 pci_unlock_rescan_remove();
4daace0d
JO
1502 hbus->state = hv_pcibus_installed;
1503 return 0;
1504}
1505
1506struct q_res_req_compl {
1507 struct completion host_event;
1508 struct hv_pci_dev *hpdev;
1509};
1510
1511/**
1512 * q_resource_requirements() - Query Resource Requirements
1513 * @context: The completion context.
1514 * @resp: The response that came from the host.
1515 * @resp_packet_size: The size in bytes of resp.
1516 *
1517 * This function is invoked on completion of a Query Resource
1518 * Requirements packet.
1519 */
1520static void q_resource_requirements(void *context, struct pci_response *resp,
1521 int resp_packet_size)
1522{
1523 struct q_res_req_compl *completion = context;
1524 struct pci_q_res_req_response *q_res_req =
1525 (struct pci_q_res_req_response *)resp;
1526 int i;
1527
1528 if (resp->status < 0) {
1529 dev_err(&completion->hpdev->hbus->hdev->device,
1530 "query resource requirements failed: %x\n",
1531 resp->status);
1532 } else {
1533 for (i = 0; i < 6; i++) {
1534 completion->hpdev->probed_bar[i] =
1535 q_res_req->probed_bar[i];
1536 }
1537 }
1538
1539 complete(&completion->host_event);
1540}
1541
1542static void get_pcichild(struct hv_pci_dev *hpdev,
1543 enum hv_pcidev_ref_reason reason)
1544{
24196f0c 1545 refcount_inc(&hpdev->refs);
4daace0d
JO
1546}
1547
1548static void put_pcichild(struct hv_pci_dev *hpdev,
1549 enum hv_pcidev_ref_reason reason)
1550{
24196f0c 1551 if (refcount_dec_and_test(&hpdev->refs))
4daace0d
JO
1552 kfree(hpdev);
1553}
1554
1555/**
1556 * new_pcichild_device() - Create a new child device
1557 * @hbus: The internal struct tracking this root PCI bus.
1558 * @desc: The information supplied so far from the host
1559 * about the device.
1560 *
1561 * This function creates the tracking structure for a new child
1562 * device and kicks off the process of figuring out what it is.
1563 *
1564 * Return: Pointer to the new tracking struct
1565 */
1566static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus,
1567 struct pci_function_description *desc)
1568{
1569 struct hv_pci_dev *hpdev;
1570 struct pci_child_message *res_req;
1571 struct q_res_req_compl comp_pkt;
8286e96d
DC
1572 struct {
1573 struct pci_packet init_packet;
1574 u8 buffer[sizeof(struct pci_child_message)];
4daace0d
JO
1575 } pkt;
1576 unsigned long flags;
1577 int ret;
1578
1579 hpdev = kzalloc(sizeof(*hpdev), GFP_ATOMIC);
1580 if (!hpdev)
1581 return NULL;
1582
1583 hpdev->hbus = hbus;
1584
1585 memset(&pkt, 0, sizeof(pkt));
1586 init_completion(&comp_pkt.host_event);
1587 comp_pkt.hpdev = hpdev;
1588 pkt.init_packet.compl_ctxt = &comp_pkt;
1589 pkt.init_packet.completion_func = q_resource_requirements;
1590 res_req = (struct pci_child_message *)&pkt.init_packet.message;
0c6045d8 1591 res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS;
4daace0d
JO
1592 res_req->wslot.slot = desc->win_slot.slot;
1593
1594 ret = vmbus_sendpacket(hbus->hdev->channel, res_req,
1595 sizeof(struct pci_child_message),
1596 (unsigned long)&pkt.init_packet,
1597 VM_PKT_DATA_INBAND,
1598 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1599 if (ret)
1600 goto error;
1601
5e10a2fa
DC
1602 if (wait_for_response(hbus->hdev, &comp_pkt.host_event))
1603 goto error;
4daace0d
JO
1604
1605 hpdev->desc = *desc;
24196f0c 1606 refcount_set(&hpdev->refs, 1);
4daace0d
JO
1607 get_pcichild(hpdev, hv_pcidev_ref_childlist);
1608 spin_lock_irqsave(&hbus->device_list_lock, flags);
4a9b0933 1609
4daace0d
JO
1610 list_add_tail(&hpdev->list_entry, &hbus->children);
1611 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1612 return hpdev;
1613
1614error:
1615 kfree(hpdev);
1616 return NULL;
1617}
1618
1619/**
1620 * get_pcichild_wslot() - Find device from slot
1621 * @hbus: Root PCI bus, as understood by this driver
1622 * @wslot: Location on the bus
1623 *
1624 * This function looks up a PCI device and returns the internal
1625 * representation of it. It acquires a reference on it, so that
1626 * the device won't be deleted while somebody is using it. The
1627 * caller is responsible for calling put_pcichild() to release
1628 * this reference.
1629 *
1630 * Return: Internal representation of a PCI device
1631 */
1632static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
1633 u32 wslot)
1634{
1635 unsigned long flags;
1636 struct hv_pci_dev *iter, *hpdev = NULL;
1637
1638 spin_lock_irqsave(&hbus->device_list_lock, flags);
1639 list_for_each_entry(iter, &hbus->children, list_entry) {
1640 if (iter->desc.win_slot.slot == wslot) {
1641 hpdev = iter;
1642 get_pcichild(hpdev, hv_pcidev_ref_by_slot);
1643 break;
1644 }
1645 }
1646 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1647
1648 return hpdev;
1649}
1650
1651/**
1652 * pci_devices_present_work() - Handle new list of child devices
1653 * @work: Work struct embedded in struct hv_dr_work
1654 *
1655 * "Bus Relations" is the Windows term for "children of this
1656 * bus." The terminology is preserved here for people trying to
1657 * debug the interaction between Hyper-V and Linux. This
1658 * function is called when the parent partition reports a list
1659 * of functions that should be observed under this PCI Express
1660 * port (bus).
1661 *
1662 * This function updates the list, and must tolerate being
1663 * called multiple times with the same information. The typical
1664 * number of child devices is one, with very atypical cases
1665 * involving three or four, so the algorithms used here can be
1666 * simple and inefficient.
1667 *
1668 * It must also treat the omission of a previously observed device as
1669 * notification that the device no longer exists.
1670 *
9053ead2
DC
1671 * Note that this function is serialized with hv_eject_device_work(),
1672 * because both are pushed to the ordered workqueue hbus->wq.
4daace0d
JO
1673 */
1674static void pci_devices_present_work(struct work_struct *work)
1675{
1676 u32 child_no;
1677 bool found;
1678 struct list_head *iter;
1679 struct pci_function_description *new_desc;
1680 struct hv_pci_dev *hpdev;
1681 struct hv_pcibus_device *hbus;
1682 struct list_head removed;
1683 struct hv_dr_work *dr_wrk;
1684 struct hv_dr_state *dr = NULL;
1685 unsigned long flags;
1686
1687 dr_wrk = container_of(work, struct hv_dr_work, wrk);
1688 hbus = dr_wrk->bus;
1689 kfree(dr_wrk);
1690
1691 INIT_LIST_HEAD(&removed);
1692
4daace0d
JO
1693 /* Pull this off the queue and process it if it was the last one. */
1694 spin_lock_irqsave(&hbus->device_list_lock, flags);
1695 while (!list_empty(&hbus->dr_list)) {
1696 dr = list_first_entry(&hbus->dr_list, struct hv_dr_state,
1697 list_entry);
1698 list_del(&dr->list_entry);
1699
1700 /* Throw this away if the list still has stuff in it. */
1701 if (!list_empty(&hbus->dr_list)) {
1702 kfree(dr);
1703 continue;
1704 }
1705 }
1706 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1707
1708 if (!dr) {
4daace0d
JO
1709 put_hvpcibus(hbus);
1710 return;
1711 }
1712
1713 /* First, mark all existing children as reported missing. */
1714 spin_lock_irqsave(&hbus->device_list_lock, flags);
1715 list_for_each(iter, &hbus->children) {
1716 hpdev = container_of(iter, struct hv_pci_dev,
1717 list_entry);
1718 hpdev->reported_missing = true;
1719 }
1720 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1721
1722 /* Next, add back any reported devices. */
1723 for (child_no = 0; child_no < dr->device_count; child_no++) {
1724 found = false;
1725 new_desc = &dr->func[child_no];
1726
1727 spin_lock_irqsave(&hbus->device_list_lock, flags);
1728 list_for_each(iter, &hbus->children) {
1729 hpdev = container_of(iter, struct hv_pci_dev,
1730 list_entry);
1731 if ((hpdev->desc.win_slot.slot ==
1732 new_desc->win_slot.slot) &&
1733 (hpdev->desc.v_id == new_desc->v_id) &&
1734 (hpdev->desc.d_id == new_desc->d_id) &&
1735 (hpdev->desc.ser == new_desc->ser)) {
1736 hpdev->reported_missing = false;
1737 found = true;
1738 }
1739 }
1740 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1741
1742 if (!found) {
1743 hpdev = new_pcichild_device(hbus, new_desc);
1744 if (!hpdev)
1745 dev_err(&hbus->hdev->device,
1746 "couldn't record a child device.\n");
1747 }
1748 }
1749
1750 /* Move missing children to a list on the stack. */
1751 spin_lock_irqsave(&hbus->device_list_lock, flags);
1752 do {
1753 found = false;
1754 list_for_each(iter, &hbus->children) {
1755 hpdev = container_of(iter, struct hv_pci_dev,
1756 list_entry);
1757 if (hpdev->reported_missing) {
1758 found = true;
1759 put_pcichild(hpdev, hv_pcidev_ref_childlist);
4f1cb01a 1760 list_move_tail(&hpdev->list_entry, &removed);
4daace0d
JO
1761 break;
1762 }
1763 }
1764 } while (found);
1765 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1766
1767 /* Delete everything that should no longer exist. */
1768 while (!list_empty(&removed)) {
1769 hpdev = list_first_entry(&removed, struct hv_pci_dev,
1770 list_entry);
1771 list_del(&hpdev->list_entry);
1772 put_pcichild(hpdev, hv_pcidev_ref_initial);
1773 }
1774
1775 switch (hbus->state) {
1776 case hv_pcibus_installed:
1777 /*
1778 * Tell the core to rescan the bus
1779 * because there may have been changes.
1780 */
1781 pci_lock_rescan_remove();
1782 pci_scan_child_bus(hbus->pci_bus);
1783 pci_unlock_rescan_remove();
1784 break;
1785
1786 case hv_pcibus_init:
1787 case hv_pcibus_probed:
1788 survey_child_resources(hbus);
1789 break;
1790
1791 default:
1792 break;
1793 }
1794
1795 put_hvpcibus(hbus);
1796 kfree(dr);
1797}
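/*
 * A minimal sketch (not part of the driver; the demo_* names are
 * hypothetical) of the mark-and-sweep reconciliation performed above:
 * mark every known child missing, clear the mark for each child the
 * host still reports, then sweep the remainder onto a private list.
 * The driver restarts its scan because it also drops references while
 * walking; without that, list_for_each_entry_safe() suffices.
 */
struct demo_child {
	struct list_head list_entry;
	bool reported_missing;
	u32 slot;
};

static void demo_reconcile(struct list_head *children,
			   const u32 *present, int count)
{
	struct demo_child *child, *tmp;
	LIST_HEAD(removed);
	int i;

	/* Mark phase: assume every known child has disappeared. */
	list_for_each_entry(child, children, list_entry)
		child->reported_missing = true;

	/* Match phase: clear the mark for each child the host reported. */
	list_for_each_entry(child, children, list_entry) {
		for (i = 0; i < count; i++) {
			if (child->slot == present[i])
				child->reported_missing = false;
		}
	}

	/* Sweep phase: move still-marked children to a private list. */
	list_for_each_entry_safe(child, tmp, children, list_entry) {
		if (child->reported_missing)
			list_move_tail(&child->list_entry, &removed);
	}

	/* 'removed' can now be torn down without holding the list lock. */
}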
1798
1799/**
1800 * hv_pci_devices_present() - Handles list of new children
1801 * @hbus: Root PCI bus, as understood by this driver
1802 * @relations: Packet from host listing children
1803 *
1804 * This function is invoked whenever a new list of devices for
1805 * this bus appears.
1806 */
1807static void hv_pci_devices_present(struct hv_pcibus_device *hbus,
1808 struct pci_bus_relations *relations)
1809{
1810 struct hv_dr_state *dr;
1811 struct hv_dr_work *dr_wrk;
1812 unsigned long flags;
1813 bool pending_dr;
1814
1815 dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT);
1816 if (!dr_wrk)
1817 return;
1818
1819 dr = kzalloc(offsetof(struct hv_dr_state, func) +
1820 (sizeof(struct pci_function_description) *
1821 (relations->device_count)), GFP_NOWAIT);
1822 if (!dr) {
1823 kfree(dr_wrk);
1824 return;
1825 }
1826
1827 INIT_WORK(&dr_wrk->wrk, pci_devices_present_work);
1828 dr_wrk->bus = hbus;
1829 dr->device_count = relations->device_count;
1830 if (dr->device_count != 0) {
1831 memcpy(dr->func, relations->func,
1832 sizeof(struct pci_function_description) *
1833 dr->device_count);
1834 }
1835
1836 spin_lock_irqsave(&hbus->device_list_lock, flags);
1837 /*
1838 * If pending_dr is true, we have already queued a work item,
1839 * which will see the new dr. Otherwise, we need to
1840 * queue a new work item.
1841 */
1842 pending_dr = !list_empty(&hbus->dr_list);
1843 list_add_tail(&dr->list_entry, &hbus->dr_list);
1844 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1845
1846 if (pending_dr) {
1847 kfree(dr_wrk);
1848 } else {
1849 get_hvpcibus(hbus);
1850 queue_work(hbus->wq, &dr_wrk->wrk);
1851 }
1852}
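/*
 * A minimal sketch (not part of the driver; the demo_* names are
 * hypothetical) of the coalescing idiom used above: post the new state
 * under the lock, but queue work only when the list was empty. Any
 * number of posts may race; exactly one work item ends up queued, and
 * the worker in pci_devices_present_work() drains everything it finds.
 */
struct demo_update {
	struct list_head entry;
	/* payload omitted */
};

static void demo_post_update(spinlock_t *lock, struct list_head *updates,
			     struct workqueue_struct *wq,
			     struct work_struct *work,
			     struct demo_update *u)
{
	unsigned long flags;
	bool pending;

	spin_lock_irqsave(lock, flags);
	pending = !list_empty(updates);	/* a queued worker will see 'u' */
	list_add_tail(&u->entry, updates);
	spin_unlock_irqrestore(lock, flags);

	if (!pending)
		queue_work(wq, work);
}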
1853
1854/**
1855 * hv_eject_device_work() - Asynchronously handles ejection
1856 * @work: Work struct embedded in internal device struct
1857 *
1858 * This function handles ejecting a device. Windows will
1859 * attempt to gracefully eject a device, waiting 60 seconds to
1860 * hear back from the guest OS that this completed successfully.
1861 * If this timer expires, the device will be forcibly removed.
1862 */
1863static void hv_eject_device_work(struct work_struct *work)
1864{
1865 struct pci_eject_response *ejct_pkt;
1866 struct hv_pci_dev *hpdev;
1867 struct pci_dev *pdev;
1868 unsigned long flags;
1869 int wslot;
1870 struct {
1871 struct pci_packet pkt;
1872 u8 buffer[sizeof(struct pci_eject_response)];
1873 } ctxt;
1874
1875 hpdev = container_of(work, struct hv_pci_dev, wrk);
1876
1877 WARN_ON(hpdev->state != hv_pcichild_ejecting);
1878
1879 /*
1880 * Ejection can come before or after the PCI bus has been set up, so
1881 * attempt to find it and tear down the bus state, if it exists. This
1882 * must be done without constructs like pci_domain_nr(hbus->pci_bus)
1883 * because hbus->pci_bus may not exist yet.
1884 */
1885 wslot = wslot_to_devfn(hpdev->desc.win_slot.slot);
1886 pdev = pci_get_domain_bus_and_slot(hpdev->hbus->sysdata.domain, 0,
1887 wslot);
1888 if (pdev) {
1889 pci_lock_rescan_remove();
1890 pci_stop_and_remove_bus_device(pdev);
1891 pci_dev_put(pdev);
1892 pci_unlock_rescan_remove();
1893 }
1894
1895 spin_lock_irqsave(&hpdev->hbus->device_list_lock, flags);
1896 list_del(&hpdev->list_entry);
1897 spin_unlock_irqrestore(&hpdev->hbus->device_list_lock, flags);
1898
1899 memset(&ctxt, 0, sizeof(ctxt));
1900 ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message;
1901 ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE;
1902 ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot;
1903 vmbus_sendpacket(hpdev->hbus->hdev->channel, ejct_pkt,
1904 sizeof(*ejct_pkt), (unsigned long)&ctxt.pkt,
1905 VM_PKT_DATA_INBAND, 0);
1906
1907 put_pcichild(hpdev, hv_pcidev_ref_childlist);
1908 put_pcichild(hpdev, hv_pcidev_ref_pnp);
1909 put_hvpcibus(hpdev->hbus);
1910}
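/*
 * A minimal sketch (not part of the driver; the demo_* name is
 * hypothetical) of the teardown performed above: look the function up
 * by domain/bus/devfn, which takes a reference, and stop/remove it
 * under the rescan/remove lock before dropping that reference.
 */
static void demo_remove_function(int domain, unsigned int busnr,
				 unsigned int devfn)
{
	struct pci_dev *pdev;

	pdev = pci_get_domain_bus_and_slot(domain, busnr, devfn);
	if (!pdev)
		return;

	pci_lock_rescan_remove();
	pci_stop_and_remove_bus_device(pdev);
	pci_unlock_rescan_remove();
	pci_dev_put(pdev);
}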
1911
1912/**
1913 * hv_pci_eject_device() - Handles device ejection
1914 * @hpdev: Internal device tracking struct
1915 *
1916 * This function is invoked when an ejection packet arrives. It
1917 * just schedules work so that we don't re-enter the packet
1918 * delivery code handling the ejection.
1919 */
1920static void hv_pci_eject_device(struct hv_pci_dev *hpdev)
1921{
1922 hpdev->state = hv_pcichild_ejecting;
1923 get_pcichild(hpdev, hv_pcidev_ref_pnp);
1924 INIT_WORK(&hpdev->wrk, hv_eject_device_work);
1925 get_hvpcibus(hpdev->hbus);
1926 queue_work(hpdev->hbus->wq, &hpdev->wrk);
1927}
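/*
 * A minimal sketch (not part of the driver; the demo_* name is
 * hypothetical): both ejection and device-presence handling queue onto
 * hbus->wq, which hv_pci_probe() creates with
 * alloc_ordered_workqueue(). An ordered workqueue runs at most one
 * item at a time, in queueing order, so the two handlers never race
 * each other for the same bus.
 */
static void demo_serialized(struct hv_pcibus_device *hbus,
			    struct work_struct *first,
			    struct work_struct *second)
{
	queue_work(hbus->wq, first);	/* runs to completion first */
	queue_work(hbus->wq, second);	/* then this, never in parallel */
}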
1928
1929/**
1930 * hv_pci_onchannelcallback() - Handles incoming packets
1931 * @context: Internal bus tracking struct
1932 *
1933 * This function is invoked whenever the host sends a packet to
1934 * this channel (which is private to this root PCI bus).
1935 */
1936static void hv_pci_onchannelcallback(void *context)
1937{
1938 const int packet_size = 0x100;
1939 int ret;
1940 struct hv_pcibus_device *hbus = context;
1941 u32 bytes_recvd;
1942 u64 req_id;
1943 struct vmpacket_descriptor *desc;
1944 unsigned char *buffer;
1945 int bufferlen = packet_size;
1946 struct pci_packet *comp_packet;
1947 struct pci_response *response;
1948 struct pci_incoming_message *new_message;
1949 struct pci_bus_relations *bus_rel;
1950 struct pci_dev_incoming *dev_message;
1951 struct hv_pci_dev *hpdev;
1952
1953 buffer = kmalloc(bufferlen, GFP_ATOMIC);
1954 if (!buffer)
1955 return;
1956
1957 while (1) {
1958 ret = vmbus_recvpacket_raw(hbus->hdev->channel, buffer,
1959 bufferlen, &bytes_recvd, &req_id);
1960
1961 if (ret == -ENOBUFS) {
1962 kfree(buffer);
1963 /* Handle large packet */
1964 bufferlen = bytes_recvd;
1965 buffer = kmalloc(bytes_recvd, GFP_ATOMIC);
1966 if (!buffer)
1967 return;
1968 continue;
1969 }
1970
1971 /* Zero length indicates there are no more packets. */
1972 if (ret || !bytes_recvd)
1973 break;
1974
1975 /*
1976 * All incoming packets must be at least as large as a
1977 * response.
1978 */
1979 if (bytes_recvd <= sizeof(struct pci_response))
1980 continue;
1981 desc = (struct vmpacket_descriptor *)buffer;
1982
1983 switch (desc->type) {
1984 case VM_PKT_COMP:
1985
1986 /*
1987 * The host is trusted, and thus it's safe to interpret
1988 * this transaction ID as a pointer.
1989 */
1990 comp_packet = (struct pci_packet *)req_id;
1991 response = (struct pci_response *)buffer;
1992 comp_packet->completion_func(comp_packet->compl_ctxt,
1993 response,
1994 bytes_recvd);
1995 break;
1996
1997 case VM_PKT_DATA_INBAND:
1998
1999 new_message = (struct pci_incoming_message *)buffer;
2000 switch (new_message->message_type.type) {
2001 case PCI_BUS_RELATIONS:
2002
2003 bus_rel = (struct pci_bus_relations *)buffer;
2004 if (bytes_recvd <
2005 offsetof(struct pci_bus_relations, func) +
2006 (sizeof(struct pci_function_description) *
2007 (bus_rel->device_count))) {
2008 dev_err(&hbus->hdev->device,
2009 "bus relations too small\n");
2010 break;
2011 }
2012
2013 hv_pci_devices_present(hbus, bus_rel);
2014 break;
2015
2016 case PCI_EJECT:
2017
2018 dev_message = (struct pci_dev_incoming *)buffer;
2019 hpdev = get_pcichild_wslot(hbus,
2020 dev_message->wslot.slot);
2021 if (hpdev) {
2022 hv_pci_eject_device(hpdev);
2023 put_pcichild(hpdev,
2024 hv_pcidev_ref_by_slot);
2025 }
2026 break;
2027
2028 default:
2029 dev_warn(&hbus->hdev->device,
2030 "Unimplemented protocol message %x\n",
2031 new_message->message_type.type);
2032 break;
2033 }
2034 break;
2035
2036 default:
2037 dev_err(&hbus->hdev->device,
2038 "unhandled packet type %d, tid %llx len %d\n",
2039 desc->type, req_id, bytes_recvd);
2040 break;
2041 }
2042 }
2043
2044 kfree(buffer);
2045}
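/*
 * A minimal sketch (not part of the driver; the demo_* name is
 * hypothetical) of the grow-and-retry receive loop above:
 * vmbus_recvpacket_raw() returns -ENOBUFS with the required size in
 * *len, so the buffer is reallocated and the read retried.
 */
static void *demo_recv(struct vmbus_channel *chan, u32 *len, u64 *req_id)
{
	u32 buflen = 0x100;
	void *buf = kmalloc(buflen, GFP_ATOMIC);

	while (buf) {
		int ret = vmbus_recvpacket_raw(chan, buf, buflen, len,
					       req_id);
		if (ret == -ENOBUFS) {
			kfree(buf);
			buflen = *len;		/* size the host asked for */
			buf = kmalloc(buflen, GFP_ATOMIC);
			continue;
		}
		if (ret || !*len) {		/* error, or ring is empty */
			kfree(buf);
			return NULL;
		}
		return buf;			/* caller owns the packet */
	}
	return NULL;
}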
2046
2047/**
2048 * hv_pci_protocol_negotiation() - Set up protocol
2049 * @hdev: VMBus's tracking struct for this root PCI bus
2050 *
2051 * This driver is intended to support running on Windows 10
2052 * (server) and later versions. It will not run on earlier
2053 * versions, as they assume that many of the operations which
2054 * Linux needs accomplished with a spinlock held were done via
2055 * asynchronous messaging via VMBus. Windows 10 increases the
2056 * surface area of PCI emulation so that these actions can take
2057 * place by suspending a virtual processor for their duration.
2058 *
2059 * This function negotiates the channel protocol version,
2060 * failing if the host doesn't support the necessary protocol
2061 * level.
2062 */
2063static int hv_pci_protocol_negotiation(struct hv_device *hdev)
2064{
2065 struct pci_version_request *version_req;
2066 struct hv_pci_compl comp_pkt;
2067 struct pci_packet *pkt;
2068 int ret;
2069 int i;
2070
2071 /*
2072 * Initiate the handshake with the host and negotiate
2073 * a version that the host can support. We start with the
2074 * highest version number and go down if the host cannot
2075 * support it.
2076 */
2077 pkt = kzalloc(sizeof(*pkt) + sizeof(*version_req), GFP_KERNEL);
2078 if (!pkt)
2079 return -ENOMEM;
2080
2081 init_completion(&comp_pkt.host_event);
2082 pkt->completion_func = hv_pci_generic_compl;
2083 pkt->compl_ctxt = &comp_pkt;
2084 version_req = (struct pci_version_request *)&pkt->message;
2085 version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION;
2086
2087 for (i = 0; i < ARRAY_SIZE(pci_protocol_versions); i++) {
2088 version_req->protocol_version = pci_protocol_versions[i];
2089 ret = vmbus_sendpacket(hdev->channel, version_req,
2090 sizeof(struct pci_version_request),
2091 (unsigned long)pkt, VM_PKT_DATA_INBAND,
2092 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2093 if (!ret)
2094 ret = wait_for_response(hdev, &comp_pkt.host_event);
2095
2096 if (ret) {
2097 dev_err(&hdev->device,
2098 "PCI Pass-through VSP failed to request version: %d",
2099 ret);
2100 goto exit;
2101 }
2102
2103 if (comp_pkt.completion_status >= 0) {
2104 pci_protocol_version = pci_protocol_versions[i];
2105 dev_info(&hdev->device,
2106 "PCI VMBus probing: Using version %#x\n",
2107 pci_protocol_version);
2108 goto exit;
2109 }
2110
2111 if (comp_pkt.completion_status != STATUS_REVISION_MISMATCH) {
2112 dev_err(&hdev->device,
2113 "PCI Pass-through VSP failed version request: %#x",
2114 comp_pkt.completion_status);
2115 ret = -EPROTO;
2116 goto exit;
2117 }
2118
2119 reinit_completion(&comp_pkt.host_event);
2120 }
2121
2122 dev_err(&hdev->device,
2123 "PCI pass-through VSP failed to find supported version");
2124 ret = -EPROTO;
2125
2126exit:
2127 kfree(pkt);
2128 return ret;
2129}
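/*
 * A minimal sketch (not part of the driver; the demo_* names are
 * hypothetical) of the negotiation loop above: offer versions from
 * newest to oldest. STATUS_REVISION_MISMATCH means "offer an older
 * one", while any other failure status is fatal.
 */
static int demo_negotiate(const u32 *versions, int count,
			  s32 (*offer)(u32 version))
{
	s32 status;
	int i;

	for (i = 0; i < count; i++) {
		status = offer(versions[i]);	/* round trip to the host */
		if (status >= 0)
			return 0;		/* versions[i] accepted */
		if (status != STATUS_REVISION_MISMATCH)
			return -EPROTO;		/* real failure, give up */
	}
	return -EPROTO;				/* no version in common */
}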
2130
2131/**
2132 * hv_pci_free_bridge_windows() - Release memory regions for the
2133 * bus
2134 * @hbus: Root PCI bus, as understood by this driver
2135 */
2136static void hv_pci_free_bridge_windows(struct hv_pcibus_device *hbus)
2137{
2138 /*
2139 * Set the resources back to the way they looked when they
2140 * were allocated by setting IORESOURCE_BUSY again.
2141 */
2142
2143 if (hbus->low_mmio_space && hbus->low_mmio_res) {
2144 hbus->low_mmio_res->flags |= IORESOURCE_BUSY;
2145 vmbus_free_mmio(hbus->low_mmio_res->start,
2146 resource_size(hbus->low_mmio_res));
2147 }
2148
2149 if (hbus->high_mmio_space && hbus->high_mmio_res) {
2150 hbus->high_mmio_res->flags |= IORESOURCE_BUSY;
2151 vmbus_free_mmio(hbus->high_mmio_res->start,
2152 resource_size(hbus->high_mmio_res));
2153 }
2154}
2155
2156/**
2157 * hv_pci_allocate_bridge_windows() - Allocate memory regions
2158 * for the bus
2159 * @hbus: Root PCI bus, as understood by this driver
2160 *
2161 * This function calls vmbus_allocate_mmio(), which is itself a
2162 * bit of a compromise. Ideally, we might change the pnp layer
2163 * in the kernel such that it comprehends either PCI devices
2164 * which are "grandchildren of ACPI," with some intermediate bus
2165 * node (in this case, VMBus) or change it such that it
2166 * understands VMBus. The pnp layer, however, has been declared
2167 * deprecated and is not subject to change.
2168 *
2169 * The workaround, implemented here, is to ask VMBus to allocate
2170 * MMIO space for this bus. VMBus itself knows which ranges are
2171 * appropriate by looking at its own ACPI objects. Then, after
2172 * these ranges are claimed, they're modified to look like they
2173 * would have looked if the ACPI and pnp code had allocated
2174 * bridge windows. These descriptors have to exist in this form
2175 * in order to satisfy the code which will get invoked when the
2176 * endpoint PCI function driver calls request_mem_region() or
2177 * request_mem_region_exclusive().
2178 *
2179 * Return: 0 on success, -errno on failure
2180 */
2181static int hv_pci_allocate_bridge_windows(struct hv_pcibus_device *hbus)
2182{
2183 resource_size_t align;
2184 int ret;
2185
2186 if (hbus->low_mmio_space) {
2187 align = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
2188 ret = vmbus_allocate_mmio(&hbus->low_mmio_res, hbus->hdev, 0,
2189 (u64)(u32)0xffffffff,
2190 hbus->low_mmio_space,
2191 align, false);
2192 if (ret) {
2193 dev_err(&hbus->hdev->device,
2194 "Need %#llx of low MMIO space. Consider reconfiguring the VM.\n",
2195 hbus->low_mmio_space);
2196 return ret;
2197 }
2198
2199 /* Modify this resource to become a bridge window. */
2200 hbus->low_mmio_res->flags |= IORESOURCE_WINDOW;
2201 hbus->low_mmio_res->flags &= ~IORESOURCE_BUSY;
2202 pci_add_resource(&hbus->resources_for_children,
2203 hbus->low_mmio_res);
2204 }
2205
2206 if (hbus->high_mmio_space) {
2207 align = 1ULL << (63 - __builtin_clzll(hbus->high_mmio_space));
2208 ret = vmbus_allocate_mmio(&hbus->high_mmio_res, hbus->hdev,
2209 0x100000000, -1,
2210 hbus->high_mmio_space, align,
2211 false);
2212 if (ret) {
2213 dev_err(&hbus->hdev->device,
2214 "Need %#llx of high MMIO space. Consider reconfiguring the VM.\n",
2215 hbus->high_mmio_space);
2216 goto release_low_mmio;
2217 }
2218
2219 /* Modify this resource to become a bridge window. */
2220 hbus->high_mmio_res->flags |= IORESOURCE_WINDOW;
2221 hbus->high_mmio_res->flags &= ~IORESOURCE_BUSY;
2222 pci_add_resource(&hbus->resources_for_children,
2223 hbus->high_mmio_res);
2224 }
2225
2226 return 0;
2227
2228release_low_mmio:
2229 if (hbus->low_mmio_res) {
2230 vmbus_free_mmio(hbus->low_mmio_res->start,
2231 resource_size(hbus->low_mmio_res));
2232 }
2233
2234 return ret;
2235}
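/*
 * A minimal sketch (not part of the driver; the demo_* name is
 * hypothetical) of the alignment computed above:
 * 1ULL << (63 - __builtin_clzll(x)) is the largest power of two not
 * exceeding x. For example, a request for 0x180000 bytes of MMIO
 * space yields an alignment of 0x100000 (1 MiB).
 */
static u64 demo_window_align(u64 space)
{
	/* __builtin_clzll() is undefined for 0; callers check first. */
	return 1ULL << (63 - __builtin_clzll(space));
}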
2236
2237/**
2238 * hv_allocate_config_window() - Find MMIO space for PCI Config
2239 * @hbus: Root PCI bus, as understood by this driver
2240 *
2241 * This function claims memory-mapped I/O space for accessing
2242 * configuration space for the functions on this bus.
2243 *
2244 * Return: 0 on success, -errno on failure
2245 */
2246static int hv_allocate_config_window(struct hv_pcibus_device *hbus)
2247{
2248 int ret;
2249
2250 /*
2251 * Set up a region of MMIO space to use for accessing configuration
2252 * space.
2253 */
2254 ret = vmbus_allocate_mmio(&hbus->mem_config, hbus->hdev, 0, -1,
2255 PCI_CONFIG_MMIO_LENGTH, 0x1000, false);
2256 if (ret)
2257 return ret;
2258
2259 /*
2260 * vmbus_allocate_mmio() gets used for allocating both device endpoint
2261 * resource claims (those which cannot be overlapped) and the ranges
2262 * which are valid for the children of this bus, which are intended
2263 * to be overlapped by those children. Set the flag on this claim
2264 * meaning that this region can't be overlapped.
2265 */
2266
2267 hbus->mem_config->flags |= IORESOURCE_BUSY;
2268
2269 return 0;
2270}
2271
2272static void hv_free_config_window(struct hv_pcibus_device *hbus)
2273{
2274 vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH);
2275}
2276
2277/**
2278 * hv_pci_enter_d0() - Bring the "bus" into the D0 power state
2279 * @hdev: VMBus's tracking struct for this root PCI bus
2280 *
2281 * Return: 0 on success, -errno on failure
2282 */
2283static int hv_pci_enter_d0(struct hv_device *hdev)
2284{
2285 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2286 struct pci_bus_d0_entry *d0_entry;
2287 struct hv_pci_compl comp_pkt;
2288 struct pci_packet *pkt;
2289 int ret;
2290
2291 /*
2292 * Tell the host that the bus is ready to use, and moved into the
2293 * powered-on state. This includes telling the host which region
2294 * of memory-mapped I/O space has been chosen for configuration space
2295 * access.
2296 */
2297 pkt = kzalloc(sizeof(*pkt) + sizeof(*d0_entry), GFP_KERNEL);
2298 if (!pkt)
2299 return -ENOMEM;
2300
2301 init_completion(&comp_pkt.host_event);
2302 pkt->completion_func = hv_pci_generic_compl;
2303 pkt->compl_ctxt = &comp_pkt;
2304 d0_entry = (struct pci_bus_d0_entry *)&pkt->message;
2305 d0_entry->message_type.type = PCI_BUS_D0ENTRY;
2306 d0_entry->mmio_base = hbus->mem_config->start;
2307
2308 ret = vmbus_sendpacket(hdev->channel, d0_entry, sizeof(*d0_entry),
2309 (unsigned long)pkt, VM_PKT_DATA_INBAND,
2310 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2311 if (!ret)
2312 ret = wait_for_response(hdev, &comp_pkt.host_event);
2313
2314 if (ret)
2315 goto exit;
2316
2317 if (comp_pkt.completion_status < 0) {
2318 dev_err(&hdev->device,
2319 "PCI Pass-through VSP failed D0 Entry with status %x\n",
2320 comp_pkt.completion_status);
2321 ret = -EPROTO;
2322 goto exit;
2323 }
2324
2325 ret = 0;
2326
2327exit:
2328 kfree(pkt);
2329 return ret;
2330}
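/*
 * A minimal sketch (not part of the driver; the demo_* name is
 * hypothetical) of the send-and-wait shape shared by hv_pci_enter_d0()
 * and the other request paths: the transaction ID carries a pointer to
 * a packet whose completion callback wakes the waiter.
 */
static int demo_send_and_wait(struct hv_device *hdev, void *msg, u32 len)
{
	struct hv_pci_compl comp_pkt;
	struct pci_packet pkt = {
		.completion_func = hv_pci_generic_compl,
		.compl_ctxt = &comp_pkt,
	};
	int ret;

	init_completion(&comp_pkt.host_event);
	ret = vmbus_sendpacket(hdev->channel, msg, len, (unsigned long)&pkt,
			       VM_PKT_DATA_INBAND,
			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
	if (!ret)
		ret = wait_for_response(hdev, &comp_pkt.host_event);
	if (ret)
		return ret;
	return comp_pkt.completion_status < 0 ? -EPROTO : 0;
}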
2331
2332/**
2333 * hv_pci_query_relations() - Ask host to send list of child
2334 * devices
2335 * @hdev: VMBus's tracking struct for this root PCI bus
2336 *
2337 * Return: 0 on success, -errno on failure
2338 */
2339static int hv_pci_query_relations(struct hv_device *hdev)
2340{
2341 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2342 struct pci_message message;
2343 struct completion comp;
2344 int ret;
2345
2346 /* Ask the host to send along the list of child devices */
2347 init_completion(&comp);
2348 if (cmpxchg(&hbus->survey_event, NULL, &comp))
2349 return -ENOTEMPTY;
2350
2351 memset(&message, 0, sizeof(message));
2352 message.type = PCI_QUERY_BUS_RELATIONS;
2353
2354 ret = vmbus_sendpacket(hdev->channel, &message, sizeof(message),
2355 0, VM_PKT_DATA_INBAND, 0);
2356 if (!ret)
2357 ret = wait_for_response(hdev, &comp);
2358
2359 return ret;
2360}
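/*
 * A minimal sketch (not part of the driver; the demo_* name is
 * hypothetical) of the cmpxchg() guard above: publish the completion
 * pointer only if the slot was NULL, so a second concurrent survey
 * fails fast instead of silently displacing the first waiter.
 */
static int demo_claim_survey(struct hv_pcibus_device *hbus,
			     struct completion *comp)
{
	/* cmpxchg() returns the old value; non-NULL means we lost a race. */
	if (cmpxchg(&hbus->survey_event, NULL, comp))
		return -ENOTEMPTY;
	return 0;
}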
2361
2362/**
2363 * hv_send_resources_allocated() - Report local resource choices
2364 * @hdev: VMBus's tracking struct for this root PCI bus
2365 *
2366 * The host OS is expecting to be sent a request as a message
2367 * which contains all the resources that the device will use.
2368 * The response contains those same resources, "translated,"
2369 * which is to say, the values which should be used by the
2370 * hardware, when it delivers an interrupt. (MMIO resources are
2371 * used in local terms.) This is nice for Windows, and lines up
2372 * with the FDO/PDO split, which doesn't exist in Linux. Linux
2373 * is deeply expecting to scan an emulated PCI configuration
2374 * space. So this message is sent here only to drive the state
2375 * machine on the host forward.
2376 *
2377 * Return: 0 on success, -errno on failure
2378 */
2379static int hv_send_resources_allocated(struct hv_device *hdev)
2380{
2381 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2382 struct pci_resources_assigned *res_assigned;
2383 struct pci_resources_assigned2 *res_assigned2;
2384 struct hv_pci_compl comp_pkt;
2385 struct hv_pci_dev *hpdev;
2386 struct pci_packet *pkt;
2387 size_t size_res;
2388 u32 wslot;
2389 int ret;
2390
2391 size_res = (pci_protocol_version < PCI_PROTOCOL_VERSION_1_2)
2392 ? sizeof(*res_assigned) : sizeof(*res_assigned2);
2393
2394 pkt = kmalloc(sizeof(*pkt) + size_res, GFP_KERNEL);
2395 if (!pkt)
2396 return -ENOMEM;
2397
2398 ret = 0;
2399
2400 for (wslot = 0; wslot < 256; wslot++) {
2401 hpdev = get_pcichild_wslot(hbus, wslot);
2402 if (!hpdev)
2403 continue;
2404
2405 memset(pkt, 0, sizeof(*pkt) + size_res);
2406 init_completion(&comp_pkt.host_event);
2407 pkt->completion_func = hv_pci_generic_compl;
2408 pkt->compl_ctxt = &comp_pkt;
2409
2410 if (pci_protocol_version < PCI_PROTOCOL_VERSION_1_2) {
2411 res_assigned =
2412 (struct pci_resources_assigned *)&pkt->message;
2413 res_assigned->message_type.type =
2414 PCI_RESOURCES_ASSIGNED;
2415 res_assigned->wslot.slot = hpdev->desc.win_slot.slot;
2416 } else {
2417 res_assigned2 =
2418 (struct pci_resources_assigned2 *)&pkt->message;
2419 res_assigned2->message_type.type =
2420 PCI_RESOURCES_ASSIGNED2;
2421 res_assigned2->wslot.slot = hpdev->desc.win_slot.slot;
2422 }
2423 put_pcichild(hpdev, hv_pcidev_ref_by_slot);
2424
2425 ret = vmbus_sendpacket(hdev->channel, &pkt->message,
2426 size_res, (unsigned long)pkt,
2427 VM_PKT_DATA_INBAND,
2428 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2429 if (!ret)
2430 ret = wait_for_response(hdev, &comp_pkt.host_event);
2431 if (ret)
2432 break;
2433
2434 if (comp_pkt.completion_status < 0) {
2435 ret = -EPROTO;
2436 dev_err(&hdev->device,
2437 "resource allocated returned 0x%x",
2438 comp_pkt.completion_status);
2439 break;
2440 }
2441 }
2442
2443 kfree(pkt);
2444 return ret;
2445}
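/*
 * A minimal sketch (not part of the driver; the demo_* name is
 * hypothetical) of the version-dependent sizing above: protocol 1.2
 * introduced pci_resources_assigned2, so the buffer and the bytes put
 * on the wire must match whichever message the host expects.
 */
static size_t demo_res_msg_size(void)
{
	return (pci_protocol_version < PCI_PROTOCOL_VERSION_1_2)
		? sizeof(struct pci_resources_assigned)
		: sizeof(struct pci_resources_assigned2);
}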
2446
2447/**
2448 * hv_send_resources_released() - Report local resources
2449 * released
2450 * @hdev: VMBus's tracking struct for this root PCI bus
2451 *
2452 * Return: 0 on success, -errno on failure
2453 */
2454static int hv_send_resources_released(struct hv_device *hdev)
2455{
2456 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2457 struct pci_child_message pkt;
2458 struct hv_pci_dev *hpdev;
2459 u32 wslot;
2460 int ret;
2461
2462 for (wslot = 0; wslot < 256; wslot++) {
2463 hpdev = get_pcichild_wslot(hbus, wslot);
2464 if (!hpdev)
2465 continue;
2466
2467 memset(&pkt, 0, sizeof(pkt));
2468 pkt.message_type.type = PCI_RESOURCES_RELEASED;
2469 pkt.wslot.slot = hpdev->desc.win_slot.slot;
2470
2471 put_pcichild(hpdev, hv_pcidev_ref_by_slot);
2472
2473 ret = vmbus_sendpacket(hdev->channel, &pkt, sizeof(pkt), 0,
2474 VM_PKT_DATA_INBAND, 0);
2475 if (ret)
2476 return ret;
2477 }
2478
2479 return 0;
2480}
2481
2482static void get_hvpcibus(struct hv_pcibus_device *hbus)
2483{
2484 atomic_inc(&hbus->remove_lock);
2485}
2486
2487static void put_hvpcibus(struct hv_pcibus_device *hbus)
2488{
2489 if (atomic_dec_and_test(&hbus->remove_lock))
2490 complete(&hbus->remove_event);
2491}
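/*
 * A minimal sketch (not part of the driver; the demo_* name is
 * hypothetical) of how this pairing is used: every asynchronous user
 * (queued work, in-flight ejection) holds a count, and hv_pci_remove()
 * drops the initial reference taken in hv_pci_probe() before sleeping
 * until the final put fires remove_event.
 */
static void demo_remove_wait(struct hv_pcibus_device *hbus)
{
	put_hvpcibus(hbus);			  /* drop probe's reference */
	wait_for_completion(&hbus->remove_event); /* last user completes */
}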
2492
2493/**
2494 * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
2495 * @hdev: VMBus's tracking struct for this root PCI bus
2496 * @dev_id: Identifies the device itself
2497 *
2498 * Return: 0 on success, -errno on failure
2499 */
2500static int hv_pci_probe(struct hv_device *hdev,
2501 const struct hv_vmbus_device_id *dev_id)
2502{
2503 struct hv_pcibus_device *hbus;
2504 int ret;
2505
2506 /*
2507 * hv_pcibus_device contains the hypercall arguments for retargeting in
2508 * hv_irq_unmask(). Those must not cross a page boundary.
2509 */
2510 BUILD_BUG_ON(sizeof(*hbus) > PAGE_SIZE);
2511
2512 hbus = (struct hv_pcibus_device *)get_zeroed_page(GFP_KERNEL);
2513 if (!hbus)
2514 return -ENOMEM;
2515 hbus->state = hv_pcibus_init;
2516
2517 /*
2518 * The PCI bus "domain" is what is called "segment" in ACPI and
2519 * other specs. Pull it from the instance ID, to get something
2520 * unique. Bytes 8 and 9 are what is used in Windows guests, so
2521 * do the same thing for consistency. Note that, since this code
2522 * only runs in a Hyper-V VM, Hyper-V can (and does) guarantee
2523 * that (1) the only domain in use for something that looks like
2524 * a physical PCI bus (which is actually emulated by the
2525 * hypervisor) is domain 0 and (2) there will be no overlap
2526 * between domains derived from these instance IDs in the same
2527 * VM.
2528 */
2529 hbus->sysdata.domain = hdev->dev_instance.b[9] |
2530 hdev->dev_instance.b[8] << 8;
2531
2532 hbus->hdev = hdev;
2533 atomic_inc(&hbus->remove_lock);
2534 INIT_LIST_HEAD(&hbus->children);
2535 INIT_LIST_HEAD(&hbus->dr_list);
2536 INIT_LIST_HEAD(&hbus->resources_for_children);
2537 spin_lock_init(&hbus->config_lock);
2538 spin_lock_init(&hbus->device_list_lock);
2539 spin_lock_init(&hbus->retarget_msi_interrupt_lock);
2540 init_completion(&hbus->remove_event);
2541 hbus->wq = alloc_ordered_workqueue("hv_pci_%x", 0,
2542 hbus->sysdata.domain);
2543 if (!hbus->wq) {
2544 ret = -ENOMEM;
2545 goto free_bus;
2546 }
2547
2548 ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
2549 hv_pci_onchannelcallback, hbus);
2550 if (ret)
2551 goto destroy_wq;
2552
2553 hv_set_drvdata(hdev, hbus);
2554
2555 ret = hv_pci_protocol_negotiation(hdev);
2556 if (ret)
2557 goto close;
2558
2559 ret = hv_allocate_config_window(hbus);
2560 if (ret)
2561 goto close;
2562
2563 hbus->cfg_addr = ioremap(hbus->mem_config->start,
2564 PCI_CONFIG_MMIO_LENGTH);
2565 if (!hbus->cfg_addr) {
2566 dev_err(&hdev->device,
2567 "Unable to map a virtual address for config space\n");
2568 ret = -ENOMEM;
2569 goto free_config;
2570 }
2571
2572 hbus->sysdata.fwnode = irq_domain_alloc_fwnode(hbus);
2573 if (!hbus->sysdata.fwnode) {
2574 ret = -ENOMEM;
2575 goto unmap;
2576 }
2577
2578 ret = hv_pcie_init_irq_domain(hbus);
2579 if (ret)
2580 goto free_fwnode;
2581
2582 ret = hv_pci_query_relations(hdev);
2583 if (ret)
2584 goto free_irq_domain;
2585
2586 ret = hv_pci_enter_d0(hdev);
2587 if (ret)
2588 goto free_irq_domain;
2589
2590 ret = hv_pci_allocate_bridge_windows(hbus);
2591 if (ret)
2592 goto free_irq_domain;
2593
2594 ret = hv_send_resources_allocated(hdev);
2595 if (ret)
2596 goto free_windows;
2597
2598 prepopulate_bars(hbus);
2599
2600 hbus->state = hv_pcibus_probed;
2601
2602 ret = create_root_hv_pci_bus(hbus);
2603 if (ret)
2604 goto free_windows;
2605
2606 return 0;
2607
2608free_windows:
2609 hv_pci_free_bridge_windows(hbus);
2610free_irq_domain:
2611 irq_domain_remove(hbus->irq_domain);
2612free_fwnode:
2613 irq_domain_free_fwnode(hbus->sysdata.fwnode);
2614unmap:
2615 iounmap(hbus->cfg_addr);
2616free_config:
2617 hv_free_config_window(hbus);
2618close:
2619 vmbus_close(hdev->channel);
2620destroy_wq:
2621 destroy_workqueue(hbus->wq);
2622free_bus:
2623 free_page((unsigned long)hbus);
2624 return ret;
2625}
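/*
 * A minimal sketch (not part of the driver; the demo_* name is
 * hypothetical) of the domain derivation in hv_pci_probe(): bytes 8
 * and 9 of the VMBus instance GUID form the 16-bit PCI domain
 * ("segment"), matching what Windows guests use for the same device.
 */
static u16 demo_domain_from_instance(const struct hv_device *hdev)
{
	return hdev->dev_instance.b[9] | (hdev->dev_instance.b[8] << 8);
}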
2626
2627static void hv_pci_bus_exit(struct hv_device *hdev)
2628{
2629 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2630 struct {
2631 struct pci_packet teardown_packet;
2632 u8 buffer[sizeof(struct pci_message)];
2633 } pkt;
2634 struct pci_bus_relations relations;
2635 struct hv_pci_compl comp_pkt;
2636 int ret;
2637
2638 /*
2639 * After the host sends the RESCIND_CHANNEL message, it doesn't
2640 * access the per-channel ringbuffer any longer.
2641 */
2642 if (hdev->channel->rescind)
2643 return;
2644
2645 /* Delete any children which might still exist. */
2646 memset(&relations, 0, sizeof(relations));
2647 hv_pci_devices_present(hbus, &relations);
2648
2649 ret = hv_send_resources_released(hdev);
2650 if (ret)
2651 dev_err(&hdev->device,
2652 "Couldn't send resources released packet(s)\n");
2653
2654 memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet));
2655 init_completion(&comp_pkt.host_event);
2656 pkt.teardown_packet.completion_func = hv_pci_generic_compl;
2657 pkt.teardown_packet.compl_ctxt = &comp_pkt;
2658 pkt.teardown_packet.message[0].type = PCI_BUS_D0EXIT;
2659
2660 ret = vmbus_sendpacket(hdev->channel, &pkt.teardown_packet.message,
2661 sizeof(struct pci_message),
2662 (unsigned long)&pkt.teardown_packet,
2663 VM_PKT_DATA_INBAND,
2664 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2665 if (!ret)
2666 wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ);
2667}
2668
2669/**
2670 * hv_pci_remove() - Remove routine for this VMBus channel
2671 * @hdev: VMBus's tracking struct for this root PCI bus
2672 *
2673 * Return: 0 on success, -errno on failure
2674 */
2675static int hv_pci_remove(struct hv_device *hdev)
2676{
2677 struct hv_pcibus_device *hbus;
2678
2679 hbus = hv_get_drvdata(hdev);
2680 if (hbus->state == hv_pcibus_installed) {
2681 /* Remove the bus from PCI's point of view. */
2682 pci_lock_rescan_remove();
2683 pci_stop_root_bus(hbus->pci_bus);
2684 pci_remove_root_bus(hbus->pci_bus);
2685 pci_unlock_rescan_remove();
2686 hbus->state = hv_pcibus_removed;
2687 }
2688
2689 hv_pci_bus_exit(hdev);
2690
2691 vmbus_close(hdev->channel);
2692
2693 iounmap(hbus->cfg_addr);
2694 hv_free_config_window(hbus);
2695 pci_free_resource_list(&hbus->resources_for_children);
2696 hv_pci_free_bridge_windows(hbus);
2697 irq_domain_remove(hbus->irq_domain);
2698 irq_domain_free_fwnode(hbus->sysdata.fwnode);
2699 put_hvpcibus(hbus);
2700 wait_for_completion(&hbus->remove_event);
2701 destroy_workqueue(hbus->wq);
2702 free_page((unsigned long)hbus);
2703 return 0;
2704}
2705
2706static const struct hv_vmbus_device_id hv_pci_id_table[] = {
2707 /* PCI Pass-through Class ID */
2708 /* 44C4F61D-4444-4400-9D52-802E27EDE19F */
2709 { HV_PCIE_GUID, },
2710 { },
2711};
2712
2713MODULE_DEVICE_TABLE(vmbus, hv_pci_id_table);
2714
2715static struct hv_driver hv_pci_drv = {
2716 .name = "hv_pci",
2717 .id_table = hv_pci_id_table,
2718 .probe = hv_pci_probe,
2719 .remove = hv_pci_remove,
2720};
2721
2722static void __exit exit_hv_pci_drv(void)
2723{
2724 vmbus_driver_unregister(&hv_pci_drv);
2725}
2726
2727static int __init init_hv_pci_drv(void)
2728{
2729 return vmbus_driver_register(&hv_pci_drv);
2730}
2731
2732module_init(init_hv_pci_drv);
2733module_exit(exit_hv_pci_drv);
2734
2735MODULE_DESCRIPTION("Hyper-V PCI");
2736MODULE_LICENSE("GPL v2");