/*
 * Copyright (c) Microsoft Corporation.
 *
 * Author:
 *   Jake Oshins <jakeo@microsoft.com>
 *
 * This driver acts as a paravirtual front-end for PCI Express root buses.
 * When a PCI Express function (either an entire device or an SR-IOV
 * Virtual Function) is being passed through to the VM, this driver exposes
 * a new bus to the guest VM. This is modeled as a root PCI bus because
 * no bridges are being exposed to the VM. In fact, with a "Generation 2"
 * VM within Hyper-V, there may seem to be no PCI bus at all in the VM
 * until a device has been exposed using this driver.
 *
 * Each root PCI bus has its own PCI domain, which is called "Segment" in
 * the PCI Firmware Specifications. Thus while each device passed through
 * to the VM using this front-end will appear at "device 0", the domain will
 * be unique. Typically, each bus will have one PCI function on it, though
 * this driver does support more than one.
 *
 * In order to map the interrupts from the device through to the guest VM,
 * this driver also implements an IRQ Domain, which handles interrupts (either
 * MSI or MSI-X) associated with the functions on the bus. As interrupts are
 * set up, torn down, or reaffined, this driver communicates with the
 * underlying hypervisor to adjust the mappings in the I/O MMU so that each
 * interrupt will be delivered to the correct virtual processor at the right
 * vector. This driver does not support level-triggered (line-based)
 * interrupts, and will report that the Interrupt Line register in the
 * function's configuration space is zero.
 *
 * The rest of this driver mostly maps PCI concepts onto underlying Hyper-V
 * facilities. For instance, the configuration space of a function exposed
 * by Hyper-V is mapped into a single page of memory space, and the
 * read and write handlers for config space must be aware of this mechanism.
 * Similarly, device setup and teardown involves messages sent to and from
 * the PCI back-end driver in Hyper-V.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for more
 * details.
 *
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/semaphore.h>
#include <linux/irqdomain.h>
#include <asm/irqdomain.h>
#include <asm/apic.h>
#include <linux/msi.h>
#include <linux/hyperv.h>
#include <asm/mshyperv.h>

/*
 * Protocol versions. The low word is the minor version, the high word the
 * major version.
 */

#define PCI_MAKE_VERSION(major, minor) ((u32)(((major) << 16) | (minor)))
#define PCI_MAJOR_VERSION(version) ((u32)(version) >> 16)
#define PCI_MINOR_VERSION(version) ((u32)(version) & 0xff)

enum {
	PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1),
	PCI_PROTOCOL_VERSION_CURRENT = PCI_PROTOCOL_VERSION_1_1
};
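
/*
 * Worked example (illustrative, not part of the protocol headers):
 * version 1.1 packs as PCI_MAKE_VERSION(1, 1) == 0x00010001, so
 * PCI_MAJOR_VERSION() recovers 1 from the high word and
 * PCI_MINOR_VERSION() recovers 1 from the low byte.
 */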

#define CPU_AFFINITY_ALL	-1ULL
#define PCI_CONFIG_MMIO_LENGTH	0x2000
#define CFG_PAGE_OFFSET		0x1000
#define CFG_PAGE_SIZE		(PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET)

#define MAX_SUPPORTED_MSI_MESSAGES 0x400

/*
 * Message Types
 */

enum pci_message_type {
	/*
	 * Version 1.1
	 */
	PCI_MESSAGE_BASE                = 0x42490000,
	PCI_BUS_RELATIONS               = PCI_MESSAGE_BASE + 0,
	PCI_QUERY_BUS_RELATIONS         = PCI_MESSAGE_BASE + 1,
	PCI_POWER_STATE_CHANGE          = PCI_MESSAGE_BASE + 4,
	PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5,
	PCI_QUERY_RESOURCE_RESOURCES    = PCI_MESSAGE_BASE + 6,
	PCI_BUS_D0ENTRY                 = PCI_MESSAGE_BASE + 7,
	PCI_BUS_D0EXIT                  = PCI_MESSAGE_BASE + 8,
	PCI_READ_BLOCK                  = PCI_MESSAGE_BASE + 9,
	PCI_WRITE_BLOCK                 = PCI_MESSAGE_BASE + 0xA,
	PCI_EJECT                       = PCI_MESSAGE_BASE + 0xB,
	PCI_QUERY_STOP                  = PCI_MESSAGE_BASE + 0xC,
	PCI_REENABLE                    = PCI_MESSAGE_BASE + 0xD,
	PCI_QUERY_STOP_FAILED           = PCI_MESSAGE_BASE + 0xE,
	PCI_EJECTION_COMPLETE           = PCI_MESSAGE_BASE + 0xF,
	PCI_RESOURCES_ASSIGNED          = PCI_MESSAGE_BASE + 0x10,
	PCI_RESOURCES_RELEASED          = PCI_MESSAGE_BASE + 0x11,
	PCI_INVALIDATE_BLOCK            = PCI_MESSAGE_BASE + 0x12,
	PCI_QUERY_PROTOCOL_VERSION      = PCI_MESSAGE_BASE + 0x13,
	PCI_CREATE_INTERRUPT_MESSAGE    = PCI_MESSAGE_BASE + 0x14,
	PCI_DELETE_INTERRUPT_MESSAGE    = PCI_MESSAGE_BASE + 0x15,
	PCI_MESSAGE_MAXIMUM
};

/*
 * Structures defining the virtual PCI Express protocol.
 */

union pci_version {
	struct {
		u16 minor_version;
		u16 major_version;
	} parts;
	u32 version;
} __packed;

/*
 * Function numbers are 8-bits wide on Express, as interpreted through ARI,
 * which is all this driver does. This representation is the one used in
 * Windows, which is what is expected when sending this back and forth with
 * the Hyper-V parent partition.
 */
union win_slot_encoding {
	struct {
		u32	dev:5;
		u32	func:3;
		u32	reserved:24;
	} bits;
	u32 slot;
} __packed;
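
/*
 * Layout note (illustrative): on x86 the first bitfield member occupies
 * the least significant bits, so dev lives in bits 0-4 and func in bits
 * 5-7; e.g. device 0, function 2 encodes as .slot == 0x40.
 */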

/*
 * Pretty much as defined in the PCI Specifications.
 */
struct pci_function_description {
	u16	v_id;	/* vendor ID */
	u16	d_id;	/* device ID */
	u8	rev;
	u8	prog_intf;
	u8	subclass;
	u8	base_class;
	u32	subsystem_id;
	union win_slot_encoding win_slot;
	u32	ser;	/* serial number */
} __packed;

/**
 * struct hv_msi_desc
 * @vector:		IDT entry
 * @delivery_mode:	As defined in Intel's Programmer's
 *			Reference Manual, Volume 3, Chapter 8.
 * @vector_count:	Number of contiguous entries in the
 *			Interrupt Descriptor Table that are
 *			occupied by this Message-Signaled
 *			Interrupt. For "MSI", as first defined
 *			in PCI 2.2, this can be between 1 and
 *			32. For "MSI-X," as first defined in PCI
 *			3.0, this must be 1, as each MSI-X table
 *			entry would have its own descriptor.
 * @reserved:		Empty space
 * @cpu_mask:		All the target virtual processors.
 */
struct hv_msi_desc {
	u8	vector;
	u8	delivery_mode;
	u16	vector_count;
	u32	reserved;
	u64	cpu_mask;
} __packed;

/**
 * struct tran_int_desc
 * @reserved:		unused, padding
 * @vector_count:	same as in hv_msi_desc
 * @data:		This is the "data payload" value that is
 *			written by the device when it generates
 *			a message-signaled interrupt, either MSI
 *			or MSI-X.
 * @address:		This is the address to which the data
 *			payload is written on interrupt
 *			generation.
 */
struct tran_int_desc {
	u16	reserved;
	u16	vector_count;
	u32	data;
	u64	address;
} __packed;
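
/*
 * Illustrative note (values are examples, not taken from this protocol):
 * on x86 the returned @address typically falls in the LAPIC MSI window
 * (0xFEExxxxx), so the device raises the interrupt by performing a DMA
 * write of @data to @address, just as with ordinary MSI/MSI-X.
 */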

/*
 * A generic message format for virtual PCI.
 * Specific message formats are defined later in the file.
 */

struct pci_message {
	u32 type;
} __packed;

struct pci_child_message {
	struct pci_message message_type;
	union win_slot_encoding wslot;
} __packed;

struct pci_incoming_message {
	struct vmpacket_descriptor hdr;
	struct pci_message message_type;
} __packed;

struct pci_response {
	struct vmpacket_descriptor hdr;
	s32 status;			/* negative values are failures */
} __packed;

struct pci_packet {
	void (*completion_func)(void *context, struct pci_response *resp,
				int resp_packet_size);
	void *compl_ctxt;

	struct pci_message message[0];
};

/*
 * Specific message types supporting the PCI protocol.
 */

/*
 * Version negotiation message. Sent from the guest to the host.
 * The guest is free to try different versions until the host
 * accepts the version.
 *
 * pci_version: The protocol version requested.
 * is_last_attempt: If TRUE, this is the last version guest will request.
 * reservedz: Reserved field, set to zero.
 */

struct pci_version_request {
	struct pci_message message_type;
	enum pci_message_type protocol_version;
} __packed;

/*
 * Bus D0 Entry. This is sent from the guest to the host when the virtual
 * bus (PCI Express port) is ready for action.
 */

struct pci_bus_d0_entry {
	struct pci_message message_type;
	u32 reserved;
	u64 mmio_base;
} __packed;

struct pci_bus_relations {
	struct pci_incoming_message incoming;
	u32 device_count;
	struct pci_function_description func[0];
} __packed;

struct pci_q_res_req_response {
	struct vmpacket_descriptor hdr;
	s32 status;			/* negative values are failures */
	u32 probed_bar[6];
} __packed;

struct pci_set_power {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	u32 power_state;		/* In Windows terms */
	u32 reserved;
} __packed;

struct pci_set_power_response {
	struct vmpacket_descriptor hdr;
	s32 status;			/* negative values are failures */
	union win_slot_encoding wslot;
	u32 resultant_state;		/* In Windows terms */
	u32 reserved;
} __packed;

struct pci_resources_assigned {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	u8 memory_range[0x14][6];	/* not used here */
	u32 msi_descriptors;
	u32 reserved[4];
} __packed;

struct pci_create_interrupt {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	struct hv_msi_desc int_desc;
} __packed;

struct pci_create_int_response {
	struct pci_response response;
	u32 reserved;
	struct tran_int_desc int_desc;
} __packed;

struct pci_delete_interrupt {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	struct tran_int_desc int_desc;
} __packed;

struct pci_dev_incoming {
	struct pci_incoming_message incoming;
	union win_slot_encoding wslot;
} __packed;

struct pci_eject_response {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	u32 status;
} __packed;

static int pci_ring_size = (4 * PAGE_SIZE);

/*
 * Definitions for the interrupt steering hypercall.
 */
#define HV_PARTITION_ID_SELF		((u64)-1)
#define HVCALL_RETARGET_INTERRUPT	0x7e

struct retarget_msi_interrupt {
	u64	partition_id;		/* use "self" */
	u64	device_id;
	u32	source;			/* 1 for MSI(-X) */
	u32	reserved1;
	u32	address;
	u32	data;
	u64	reserved2;
	u32	vector;
	u32	flags;
	u64	vp_mask;
} __packed;

/*
 * Driver specific state.
 */

enum hv_pcibus_state {
	hv_pcibus_init = 0,
	hv_pcibus_probed,
	hv_pcibus_installed,
	hv_pcibus_removed,
	hv_pcibus_maximum
};

struct hv_pcibus_device {
	struct pci_sysdata sysdata;
	enum hv_pcibus_state state;
	atomic_t remove_lock;
	struct hv_device *hdev;
	resource_size_t low_mmio_space;
	resource_size_t high_mmio_space;
	struct resource *mem_config;
	struct resource *low_mmio_res;
	struct resource *high_mmio_res;
	struct completion *survey_event;
	struct completion remove_event;
	struct pci_bus *pci_bus;
	spinlock_t config_lock;		/* Avoid two threads writing index page */
	spinlock_t device_list_lock;	/* Protect lists below */
	void __iomem *cfg_addr;

	struct semaphore enum_sem;
	struct list_head resources_for_children;

	struct list_head children;
	struct list_head dr_list;

	struct msi_domain_info msi_info;
	struct msi_controller msi_chip;
	struct irq_domain *irq_domain;
	struct retarget_msi_interrupt retarget_msi_interrupt_params;
	spinlock_t retarget_msi_interrupt_lock;
};

/*
 * Tracks "Device Relations" messages from the host, which must be both
 * processed in order and deferred so that they don't run in the context
 * of the incoming packet callback.
 */
struct hv_dr_work {
	struct work_struct wrk;
	struct hv_pcibus_device *bus;
};

struct hv_dr_state {
	struct list_head list_entry;
	u32 device_count;
	struct pci_function_description func[0];
};

enum hv_pcichild_state {
	hv_pcichild_init = 0,
	hv_pcichild_requirements,
	hv_pcichild_resourced,
	hv_pcichild_ejecting,
	hv_pcichild_maximum
};

enum hv_pcidev_ref_reason {
	hv_pcidev_ref_invalid = 0,
	hv_pcidev_ref_initial,
	hv_pcidev_ref_by_slot,
	hv_pcidev_ref_packet,
	hv_pcidev_ref_pnp,
	hv_pcidev_ref_childlist,
	hv_pcidev_irqdata,
	hv_pcidev_ref_max
};

struct hv_pci_dev {
	/* List protected by pci_rescan_remove_lock */
	struct list_head list_entry;
	atomic_t refs;
	enum hv_pcichild_state state;
	struct pci_function_description desc;
	bool reported_missing;
	struct hv_pcibus_device *hbus;
	struct work_struct wrk;

	/*
	 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
	 * read it back, for each of the BAR offsets within config space.
	 */
	u32 probed_bar[6];
};

struct hv_pci_compl {
	struct completion host_event;
	s32 completion_status;
};

/**
 * hv_pci_generic_compl() - Invoked for a completion packet
 * @context:		Set up by the sender of the packet.
 * @resp:		The response packet
 * @resp_packet_size:	Size in bytes of the packet
 *
 * This function is used to trigger an event and report status
 * for any message for which the completion packet contains a
 * status and nothing else.
 */
static void hv_pci_generic_compl(void *context, struct pci_response *resp,
				 int resp_packet_size)
{
	struct hv_pci_compl *comp_pkt = context;

	if (resp_packet_size >= offsetofend(struct pci_response, status))
		comp_pkt->completion_status = resp->status;
	else
		comp_pkt->completion_status = -1;

	complete(&comp_pkt->host_event);
}

static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
					     u32 wslot);
static void get_pcichild(struct hv_pci_dev *hv_pcidev,
			 enum hv_pcidev_ref_reason reason);
static void put_pcichild(struct hv_pci_dev *hv_pcidev,
			 enum hv_pcidev_ref_reason reason);

static void get_hvpcibus(struct hv_pcibus_device *hv_pcibus);
static void put_hvpcibus(struct hv_pcibus_device *hv_pcibus);

/**
 * devfn_to_wslot() - Convert from Linux PCI slot to Windows
 * @devfn:	The Linux representation of PCI slot
 *
 * Windows uses a slightly different representation of PCI slot.
 *
 * Return: The Windows representation
 */
static u32 devfn_to_wslot(int devfn)
{
	union win_slot_encoding wslot;

	wslot.slot = 0;
	wslot.bits.dev = PCI_SLOT(devfn);
	wslot.bits.func = PCI_FUNC(devfn);

	return wslot.slot;
}

/**
 * wslot_to_devfn() - Convert from Windows PCI slot to Linux
 * @wslot:	The Windows representation of PCI slot
 *
 * Windows uses a slightly different representation of PCI slot.
 *
 * Return: The Linux representation
 */
static int wslot_to_devfn(u32 wslot)
{
	union win_slot_encoding slot_no;

	slot_no.slot = wslot;
	return PCI_DEVFN(slot_no.bits.dev, slot_no.bits.func);
}
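
/*
 * Round-trip example (illustrative only): devfn == PCI_DEVFN(0, 2) gives
 * devfn_to_wslot(devfn) == 0x40 (function number in bits 5-7), and
 * wslot_to_devfn(0x40) returns the original devfn.
 */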

/*
 * PCI Configuration Space for these root PCI buses is implemented as a pair
 * of pages in memory-mapped I/O space. Writing to the first page chooses
 * the PCI function being written or read. Once the first page has been
 * written to, the following page maps in the entire configuration space of
 * the function.
 */

/**
 * _hv_pcifront_read_config() - Internal PCI config read
 * @hpdev:	The PCI driver's representation of the device
 * @where:	Offset within config space
 * @size:	Size of the transfer
 * @val:	Pointer to the buffer receiving the data
 */
static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
				     int size, u32 *val)
{
	unsigned long flags;
	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;

	/*
	 * If the attempt is to read the IDs or the ROM BAR, simulate that.
	 */
	if (where + size <= PCI_COMMAND) {
		memcpy(val, ((u8 *)&hpdev->desc.v_id) + where, size);
	} else if (where >= PCI_CLASS_REVISION && where + size <=
		   PCI_CACHE_LINE_SIZE) {
		memcpy(val, ((u8 *)&hpdev->desc.rev) + where -
		       PCI_CLASS_REVISION, size);
	} else if (where >= PCI_SUBSYSTEM_VENDOR_ID && where + size <=
		   PCI_ROM_ADDRESS) {
		memcpy(val, (u8 *)&hpdev->desc.subsystem_id + where -
		       PCI_SUBSYSTEM_VENDOR_ID, size);
	} else if (where >= PCI_ROM_ADDRESS && where + size <=
		   PCI_CAPABILITY_LIST) {
		/* ROM BARs are unimplemented */
		*val = 0;
	} else if (where >= PCI_INTERRUPT_LINE && where + size <=
		   PCI_INTERRUPT_PIN) {
		/*
		 * Interrupt Line and Interrupt PIN are hard-wired to zero
		 * because this front-end only supports message-signaled
		 * interrupts.
		 */
		*val = 0;
	} else if (where + size <= CFG_PAGE_SIZE) {
		spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
		/* Choose the function to be read. (See comment above) */
		writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
		/* Make sure the function was chosen before we start reading. */
		mb();
		/* Read from that function's config space. */
		switch (size) {
		case 1:
			*val = readb(addr);
			break;
		case 2:
			*val = readw(addr);
			break;
		default:
			*val = readl(addr);
			break;
		}
		/*
		 * Make sure the write was done before we release the spinlock
		 * allowing consecutive reads/writes.
		 */
		mb();
		spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
	} else {
		dev_err(&hpdev->hbus->hdev->device,
			"Attempt to read beyond a function's config space.\n");
	}
}

/**
 * _hv_pcifront_write_config() - Internal PCI config write
 * @hpdev:	The PCI driver's representation of the device
 * @where:	Offset within config space
 * @size:	Size of the transfer
 * @val:	The data being transferred
 */
static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where,
				      int size, u32 val)
{
	unsigned long flags;
	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;

	if (where >= PCI_SUBSYSTEM_VENDOR_ID &&
	    where + size <= PCI_CAPABILITY_LIST) {
		/* SSIDs and ROM BARs are read-only */
	} else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) {
		spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
		/* Choose the function to be written. (See comment above) */
		writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
		/* Make sure the function was chosen before we start writing. */
		wmb();
		/* Write to that function's config space. */
		switch (size) {
		case 1:
			writeb(val, addr);
			break;
		case 2:
			writew(val, addr);
			break;
		default:
			writel(val, addr);
			break;
		}
		/*
		 * Make sure the write was done before we release the spinlock
		 * allowing consecutive reads/writes.
		 */
		mb();
		spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
	} else {
		dev_err(&hpdev->hbus->hdev->device,
			"Attempt to write beyond a function's config space.\n");
	}
}

/**
 * hv_pcifront_read_config() - Read configuration space
 * @bus:	PCI Bus structure
 * @devfn:	Device/function
 * @where:	Offset from base
 * @size:	Byte/word/dword
 * @val:	Value to be read
 *
 * Return: PCIBIOS_SUCCESSFUL on success
 *	   PCIBIOS_DEVICE_NOT_FOUND on failure
 */
static int hv_pcifront_read_config(struct pci_bus *bus, unsigned int devfn,
				   int where, int size, u32 *val)
{
	struct hv_pcibus_device *hbus =
		container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
	struct hv_pci_dev *hpdev;

	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
	if (!hpdev)
		return PCIBIOS_DEVICE_NOT_FOUND;

	_hv_pcifront_read_config(hpdev, where, size, val);

	put_pcichild(hpdev, hv_pcidev_ref_by_slot);
	return PCIBIOS_SUCCESSFUL;
}

/**
 * hv_pcifront_write_config() - Write configuration space
 * @bus:	PCI Bus structure
 * @devfn:	Device/function
 * @where:	Offset from base
 * @size:	Byte/word/dword
 * @val:	Value to be written to device
 *
 * Return: PCIBIOS_SUCCESSFUL on success
 *	   PCIBIOS_DEVICE_NOT_FOUND on failure
 */
static int hv_pcifront_write_config(struct pci_bus *bus, unsigned int devfn,
				    int where, int size, u32 val)
{
	struct hv_pcibus_device *hbus =
		container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
	struct hv_pci_dev *hpdev;

	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
	if (!hpdev)
		return PCIBIOS_DEVICE_NOT_FOUND;

	_hv_pcifront_write_config(hpdev, where, size, val);

	put_pcichild(hpdev, hv_pcidev_ref_by_slot);
	return PCIBIOS_SUCCESSFUL;
}

/* PCIe operations */
static struct pci_ops hv_pcifront_ops = {
	.read  = hv_pcifront_read_config,
	.write = hv_pcifront_write_config,
};

/* Interrupt management hooks */
static void hv_int_desc_free(struct hv_pci_dev *hpdev,
			     struct tran_int_desc *int_desc)
{
	struct pci_delete_interrupt *int_pkt;
	struct {
		struct pci_packet pkt;
		u8 buffer[sizeof(struct pci_delete_interrupt)];
	} ctxt;

	memset(&ctxt, 0, sizeof(ctxt));
	int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
	int_pkt->message_type.type =
		PCI_DELETE_INTERRUPT_MESSAGE;
	int_pkt->wslot.slot = hpdev->desc.win_slot.slot;
	int_pkt->int_desc = *int_desc;
	vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, sizeof(*int_pkt),
			 (unsigned long)&ctxt.pkt, VM_PKT_DATA_INBAND, 0);
	kfree(int_desc);
}

/**
 * hv_msi_free() - Free the MSI.
 * @domain:	The interrupt domain pointer
 * @info:	Extra MSI-related context
 * @irq:	Identifies the IRQ.
 *
 * The Hyper-V parent partition and hypervisor are tracking the
 * messages that are in use, keeping the interrupt redirection
 * table up to date. This callback sends a message that frees
 * the IRT entry and related tracking nonsense.
 */
static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info,
			unsigned int irq)
{
	struct hv_pcibus_device *hbus;
	struct hv_pci_dev *hpdev;
	struct pci_dev *pdev;
	struct tran_int_desc *int_desc;
	struct irq_data *irq_data = irq_domain_get_irq_data(domain, irq);
	struct msi_desc *msi = irq_data_get_msi_desc(irq_data);

	pdev = msi_desc_to_pci_dev(msi);
	hbus = info->data;
	int_desc = irq_data_get_irq_chip_data(irq_data);
	if (!int_desc)
		return;

	irq_data->chip_data = NULL;
	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
	if (!hpdev) {
		kfree(int_desc);
		return;
	}

	hv_int_desc_free(hpdev, int_desc);
	put_pcichild(hpdev, hv_pcidev_ref_by_slot);
}

static int hv_set_affinity(struct irq_data *data, const struct cpumask *dest,
			   bool force)
{
	struct irq_data *parent = data->parent_data;

	return parent->chip->irq_set_affinity(parent, dest, force);
}

static void hv_irq_mask(struct irq_data *data)
{
	pci_msi_mask_irq(data);
}

/**
 * hv_irq_unmask() - "Unmask" the IRQ by setting its current
 * affinity.
 * @data:	Describes the IRQ
 *
 * Build a new destination for the MSI and make a hypercall to
 * update the Interrupt Redirection Table. "Device Logical ID"
 * is built out of this PCI bus's instance GUID and the function
 * number of the device.
 */
static void hv_irq_unmask(struct irq_data *data)
{
	struct msi_desc *msi_desc = irq_data_get_msi_desc(data);
	struct irq_cfg *cfg = irqd_cfg(data);
	struct retarget_msi_interrupt *params;
	struct hv_pcibus_device *hbus;
	struct cpumask *dest;
	struct pci_bus *pbus;
	struct pci_dev *pdev;
	int cpu;
	unsigned long flags;

	dest = irq_data_get_affinity_mask(data);
	pdev = msi_desc_to_pci_dev(msi_desc);
	pbus = pdev->bus;
	hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);

	spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags);

	params = &hbus->retarget_msi_interrupt_params;
	memset(params, 0, sizeof(*params));
	params->partition_id = HV_PARTITION_ID_SELF;
	params->source = 1; /* MSI(-X) */
	params->address = msi_desc->msg.address_lo;
	params->data = msi_desc->msg.data;
	params->device_id = (hbus->hdev->dev_instance.b[5] << 24) |
			   (hbus->hdev->dev_instance.b[4] << 16) |
			   (hbus->hdev->dev_instance.b[7] << 8) |
			   (hbus->hdev->dev_instance.b[6] & 0xf8) |
			   PCI_FUNC(pdev->devfn);
	params->vector = cfg->vector;

	for_each_cpu_and(cpu, dest, cpu_online_mask)
		params->vp_mask |= (1ULL << vmbus_cpu_number_to_vp_number(cpu));

	hv_do_hypercall(HVCALL_RETARGET_INTERRUPT, params, NULL);

	spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags);

	pci_msi_unmask_irq(data);
}

struct compose_comp_ctxt {
	struct hv_pci_compl comp_pkt;
	struct tran_int_desc int_desc;
};

static void hv_pci_compose_compl(void *context, struct pci_response *resp,
				 int resp_packet_size)
{
	struct compose_comp_ctxt *comp_pkt = context;
	struct pci_create_int_response *int_resp =
		(struct pci_create_int_response *)resp;

	comp_pkt->comp_pkt.completion_status = resp->status;
	comp_pkt->int_desc = int_resp->int_desc;
	complete(&comp_pkt->comp_pkt.host_event);
}

/**
 * hv_compose_msi_msg() - Supplies a valid MSI address/data
 * @data:	Everything about this MSI
 * @msg:	Buffer that is filled in by this function
 *
 * This function unpacks the IRQ looking for target CPU set, IDT
 * vector and mode and sends a message to the parent partition
 * asking for a mapping for that tuple in this partition. The
 * response supplies a data value and address to which that data
 * should be written to trigger that interrupt.
 */
static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
{
	struct irq_cfg *cfg = irqd_cfg(data);
	struct hv_pcibus_device *hbus;
	struct hv_pci_dev *hpdev;
	struct pci_bus *pbus;
	struct pci_dev *pdev;
	struct pci_create_interrupt *int_pkt;
	struct compose_comp_ctxt comp;
	struct tran_int_desc *int_desc;
	struct cpumask *affinity;
	struct {
		struct pci_packet pkt;
		u8 buffer[sizeof(struct pci_create_interrupt)];
	} ctxt;
	int cpu;
	int ret;

	pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data));
	pbus = pdev->bus;
	hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
	if (!hpdev)
		goto return_null_message;

	/* Free any previous message that might have already been composed. */
	if (data->chip_data) {
		int_desc = data->chip_data;
		data->chip_data = NULL;
		hv_int_desc_free(hpdev, int_desc);
	}

	int_desc = kzalloc(sizeof(*int_desc), GFP_KERNEL);
	if (!int_desc)
		goto drop_reference;

	memset(&ctxt, 0, sizeof(ctxt));
	init_completion(&comp.comp_pkt.host_event);
	ctxt.pkt.completion_func = hv_pci_compose_compl;
	ctxt.pkt.compl_ctxt = &comp;
	int_pkt = (struct pci_create_interrupt *)&ctxt.pkt.message;
	int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
	int_pkt->wslot.slot = hpdev->desc.win_slot.slot;
	int_pkt->int_desc.vector = cfg->vector;
	int_pkt->int_desc.vector_count = 1;
	int_pkt->int_desc.delivery_mode =
		(apic->irq_delivery_mode == dest_LowestPrio) ? 1 : 0;

	/*
	 * This bit doesn't have to work on machines with more than 64
	 * processors because Hyper-V only supports 64 in a guest.
	 */
	affinity = irq_data_get_affinity_mask(data);
	if (cpumask_weight(affinity) >= 32) {
		int_pkt->int_desc.cpu_mask = CPU_AFFINITY_ALL;
	} else {
		for_each_cpu_and(cpu, affinity, cpu_online_mask) {
			int_pkt->int_desc.cpu_mask |=
				(1ULL << vmbus_cpu_number_to_vp_number(cpu));
		}
	}

	ret = vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt,
			       sizeof(*int_pkt), (unsigned long)&ctxt.pkt,
			       VM_PKT_DATA_INBAND,
			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
	if (ret)
		goto free_int_desc;

	wait_for_completion(&comp.comp_pkt.host_event);

	if (comp.comp_pkt.completion_status < 0) {
		dev_err(&hbus->hdev->device,
			"Request for interrupt failed: 0x%x",
			comp.comp_pkt.completion_status);
		goto free_int_desc;
	}

	/*
	 * Record the assignment so that this can be unwound later. Using
	 * irq_set_chip_data() here would be appropriate, but the lock it takes
	 * is already held.
	 */
	*int_desc = comp.int_desc;
	data->chip_data = int_desc;

	/* Pass up the result. */
	msg->address_hi = comp.int_desc.address >> 32;
	msg->address_lo = comp.int_desc.address & 0xffffffff;
	msg->data = comp.int_desc.data;

	put_pcichild(hpdev, hv_pcidev_ref_by_slot);
	return;

free_int_desc:
	kfree(int_desc);
drop_reference:
	put_pcichild(hpdev, hv_pcidev_ref_by_slot);
return_null_message:
	msg->address_hi = 0;
	msg->address_lo = 0;
	msg->data = 0;
}

/* HW Interrupt Chip Descriptor */
static struct irq_chip hv_msi_irq_chip = {
	.name			= "Hyper-V PCIe MSI",
	.irq_compose_msi_msg	= hv_compose_msi_msg,
	.irq_set_affinity	= hv_set_affinity,
	.irq_ack		= irq_chip_ack_parent,
	.irq_mask		= hv_irq_mask,
	.irq_unmask		= hv_irq_unmask,
};

static irq_hw_number_t hv_msi_domain_ops_get_hwirq(struct msi_domain_info *info,
						   msi_alloc_info_t *arg)
{
	return arg->msi_hwirq;
}

static struct msi_domain_ops hv_msi_ops = {
	.get_hwirq	= hv_msi_domain_ops_get_hwirq,
	.msi_prepare	= pci_msi_prepare,
	.set_desc	= pci_msi_set_desc,
	.msi_free	= hv_msi_free,
};

/**
 * hv_pcie_init_irq_domain() - Initialize IRQ domain
 * @hbus:	The root PCI bus
 *
 * This function creates an IRQ domain which will be used for
 * interrupts from devices that have been passed through. These
 * devices only support MSI and MSI-X, not line-based interrupts
 * or simulations of line-based interrupts through PCIe's
 * fabric-layer messages. Because interrupts are remapped, we
 * can support multi-message MSI here.
 *
 * Return: '0' on success and error value on failure
 */
static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus)
{
	hbus->msi_info.chip = &hv_msi_irq_chip;
	hbus->msi_info.ops = &hv_msi_ops;
	hbus->msi_info.flags = (MSI_FLAG_USE_DEF_DOM_OPS |
		MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI |
		MSI_FLAG_PCI_MSIX);
	hbus->msi_info.handler = handle_edge_irq;
	hbus->msi_info.handler_name = "edge";
	hbus->msi_info.data = hbus;
	hbus->irq_domain = pci_msi_create_irq_domain(hbus->sysdata.fwnode,
						     &hbus->msi_info,
						     x86_vector_domain);
	if (!hbus->irq_domain) {
		dev_err(&hbus->hdev->device,
			"Failed to build an MSI IRQ domain\n");
		return -ENODEV;
	}

	return 0;
}

/**
 * get_bar_size() - Get the address space consumed by a BAR
 * @bar_val:	Value that a BAR returned after -1 was written
 *		to it.
 *
 * This function returns the size of the BAR, rounded up to 1
 * page. It has to be rounded up because the hypervisor's page
 * table entry that maps the BAR into the VM can't specify an
 * offset within a page. The invariant is that the hypervisor
 * must place any BAR that is smaller than a page at the
 * beginning of a page.
 *
 * Return:	Size in bytes of the consumed MMIO space.
 */
static u64 get_bar_size(u64 bar_val)
{
	return round_up((1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK)),
			PAGE_SIZE);
}
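
/*
 * Worked example (illustrative only): a 32-bit memory BAR that reads back
 * as 0xFFFFF000 after the all-ones write is extended with ones in the
 * upper 32 bits by the callers below, so get_bar_size() computes
 * ~(0xFFFFFFFFFFFFF000) + 1 == 0x1000, i.e. 4 KiB. Anything smaller than
 * a page is rounded up to PAGE_SIZE.
 */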

/**
 * survey_child_resources() - Total all MMIO requirements
 * @hbus:	Root PCI bus, as understood by this driver
 */
static void survey_child_resources(struct hv_pcibus_device *hbus)
{
	struct list_head *iter;
	struct hv_pci_dev *hpdev;
	resource_size_t bar_size = 0;
	unsigned long flags;
	struct completion *event;
	u64 bar_val;
	int i;

	/* If nobody is waiting on the answer, don't compute it. */
	event = xchg(&hbus->survey_event, NULL);
	if (!event)
		return;

	/* If the answer has already been computed, go with it. */
	if (hbus->low_mmio_space || hbus->high_mmio_space) {
		complete(event);
		return;
	}

	spin_lock_irqsave(&hbus->device_list_lock, flags);

	/*
	 * Due to an interesting quirk of the PCI spec, all memory regions
	 * for a child device are a power of 2 in size and aligned in memory,
	 * so it's sufficient to just add them up without tracking alignment.
	 */
	list_for_each(iter, &hbus->children) {
		hpdev = container_of(iter, struct hv_pci_dev, list_entry);
		for (i = 0; i < 6; i++) {
			if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO)
				dev_err(&hbus->hdev->device,
					"There's an I/O BAR in this list!\n");

			if (hpdev->probed_bar[i] != 0) {
				/*
				 * A probed BAR has all the upper bits set that
				 * can be changed.
				 */

				bar_val = hpdev->probed_bar[i];
				if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
					bar_val |=
					((u64)hpdev->probed_bar[++i] << 32);
				else
					bar_val |= 0xffffffff00000000ULL;

				bar_size = get_bar_size(bar_val);

				if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
					hbus->high_mmio_space += bar_size;
				else
					hbus->low_mmio_space += bar_size;
			}
		}
	}

	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
	complete(event);
}

/**
 * prepopulate_bars() - Fill in BARs with defaults
 * @hbus:	Root PCI bus, as understood by this driver
 *
 * The core PCI driver code seems much, much happier if the BARs
 * for a device have values upon first scan. So fill them in.
 * The algorithm below works down from large sizes to small,
 * attempting to pack the assignments optimally. The assumption,
 * enforced in other parts of the code, is that the beginning of
 * the memory-mapped I/O space will be aligned on the largest
 * BAR size.
 */
static void prepopulate_bars(struct hv_pcibus_device *hbus)
{
	resource_size_t high_size = 0;
	resource_size_t low_size = 0;
	resource_size_t high_base = 0;
	resource_size_t low_base = 0;
	resource_size_t bar_size;
	struct hv_pci_dev *hpdev;
	struct list_head *iter;
	unsigned long flags;
	u64 bar_val;
	u32 command;
	bool high;
	int i;

	if (hbus->low_mmio_space) {
		low_size = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
		low_base = hbus->low_mmio_res->start;
	}

	if (hbus->high_mmio_space) {
		high_size = 1ULL <<
			(63 - __builtin_clzll(hbus->high_mmio_space));
		high_base = hbus->high_mmio_res->start;
	}

	spin_lock_irqsave(&hbus->device_list_lock, flags);

	/* Pick addresses for the BARs. */
	do {
		list_for_each(iter, &hbus->children) {
			hpdev = container_of(iter, struct hv_pci_dev,
					     list_entry);
			for (i = 0; i < 6; i++) {
				bar_val = hpdev->probed_bar[i];
				if (bar_val == 0)
					continue;
				high = bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64;
				if (high) {
					bar_val |=
						((u64)hpdev->probed_bar[i + 1]
						 << 32);
				} else {
					bar_val |= 0xffffffffULL << 32;
				}
				bar_size = get_bar_size(bar_val);
				if (high) {
					if (high_size != bar_size) {
						i++;
						continue;
					}
					_hv_pcifront_write_config(hpdev,
						PCI_BASE_ADDRESS_0 + (4 * i),
						4,
						(u32)(high_base & 0xffffff00));
					i++;
					_hv_pcifront_write_config(hpdev,
						PCI_BASE_ADDRESS_0 + (4 * i),
						4, (u32)(high_base >> 32));
					high_base += bar_size;
				} else {
					if (low_size != bar_size)
						continue;
					_hv_pcifront_write_config(hpdev,
						PCI_BASE_ADDRESS_0 + (4 * i),
						4,
						(u32)(low_base & 0xffffff00));
					low_base += bar_size;
				}
			}
			if (high_size <= 1 && low_size <= 1) {
				/* Set the memory enable bit. */
				_hv_pcifront_read_config(hpdev, PCI_COMMAND, 2,
							 &command);
				command |= PCI_COMMAND_MEMORY;
				_hv_pcifront_write_config(hpdev, PCI_COMMAND, 2,
							  command);
				break;
			}
		}

		high_size >>= 1;
		low_size >>= 1;
	} while (high_size || low_size);

	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
}

/**
 * create_root_hv_pci_bus() - Expose a new root PCI bus
 * @hbus:	Root PCI bus, as understood by this driver
 *
 * Return: 0 on success, -errno on failure
 */
static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus)
{
	/* Register the device */
	hbus->pci_bus = pci_create_root_bus(&hbus->hdev->device,
					    0, /* bus number is always zero */
					    &hv_pcifront_ops,
					    &hbus->sysdata,
					    &hbus->resources_for_children);
	if (!hbus->pci_bus)
		return -ENODEV;

	hbus->pci_bus->msi = &hbus->msi_chip;
	hbus->pci_bus->msi->dev = &hbus->hdev->device;

	pci_lock_rescan_remove();
	pci_scan_child_bus(hbus->pci_bus);
	pci_bus_assign_resources(hbus->pci_bus);
	pci_bus_add_devices(hbus->pci_bus);
	pci_unlock_rescan_remove();
	hbus->state = hv_pcibus_installed;
	return 0;
}

struct q_res_req_compl {
	struct completion host_event;
	struct hv_pci_dev *hpdev;
};

/**
 * q_resource_requirements() - Query Resource Requirements
 * @context:		The completion context.
 * @resp:		The response that came from the host.
 * @resp_packet_size:	The size in bytes of resp.
 *
 * This function is invoked on completion of a Query Resource
 * Requirements packet.
 */
static void q_resource_requirements(void *context, struct pci_response *resp,
				    int resp_packet_size)
{
	struct q_res_req_compl *completion = context;
	struct pci_q_res_req_response *q_res_req =
		(struct pci_q_res_req_response *)resp;
	int i;

	if (resp->status < 0) {
		dev_err(&completion->hpdev->hbus->hdev->device,
			"query resource requirements failed: %x\n",
			resp->status);
	} else {
		for (i = 0; i < 6; i++) {
			completion->hpdev->probed_bar[i] =
				q_res_req->probed_bar[i];
		}
	}

	complete(&completion->host_event);
}

static void get_pcichild(struct hv_pci_dev *hpdev,
			 enum hv_pcidev_ref_reason reason)
{
	atomic_inc(&hpdev->refs);
}

static void put_pcichild(struct hv_pci_dev *hpdev,
			 enum hv_pcidev_ref_reason reason)
{
	if (atomic_dec_and_test(&hpdev->refs))
		kfree(hpdev);
}

/**
 * new_pcichild_device() - Create a new child device
 * @hbus:	The internal struct tracking this root PCI bus.
 * @desc:	The information supplied so far from the host
 *		about the device.
 *
 * This function creates the tracking structure for a new child
 * device and kicks off the process of figuring out what it is.
 *
 * Return: Pointer to the new tracking struct
 */
static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus,
		struct pci_function_description *desc)
{
	struct hv_pci_dev *hpdev;
	struct pci_child_message *res_req;
	struct q_res_req_compl comp_pkt;
	struct {
		struct pci_packet init_packet;
		u8 buffer[sizeof(struct pci_child_message)];
	} pkt;
	unsigned long flags;
	int ret;

	hpdev = kzalloc(sizeof(*hpdev), GFP_ATOMIC);
	if (!hpdev)
		return NULL;

	hpdev->hbus = hbus;

	memset(&pkt, 0, sizeof(pkt));
	init_completion(&comp_pkt.host_event);
	comp_pkt.hpdev = hpdev;
	pkt.init_packet.compl_ctxt = &comp_pkt;
	pkt.init_packet.completion_func = q_resource_requirements;
	res_req = (struct pci_child_message *)&pkt.init_packet.message;
	res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS;
	res_req->wslot.slot = desc->win_slot.slot;

	ret = vmbus_sendpacket(hbus->hdev->channel, res_req,
			       sizeof(struct pci_child_message),
			       (unsigned long)&pkt.init_packet,
			       VM_PKT_DATA_INBAND,
			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
	if (ret)
		goto error;

	wait_for_completion(&comp_pkt.host_event);

	hpdev->desc = *desc;
	get_pcichild(hpdev, hv_pcidev_ref_initial);
	get_pcichild(hpdev, hv_pcidev_ref_childlist);
	spin_lock_irqsave(&hbus->device_list_lock, flags);

	/*
	 * When a device is being added to the bus, we set the PCI domain
	 * number to be the device serial number, which is non-zero and
	 * unique on the same VM. The serial numbers start with 1, and
	 * increase by 1 for each device. So device names including this
	 * can have shorter names than based on the bus instance UUID.
	 * Only the first device serial number is used for domain, so the
	 * domain number will not change after the first device is added.
	 */
	if (list_empty(&hbus->children))
		hbus->sysdata.domain = desc->ser;
	list_add_tail(&hpdev->list_entry, &hbus->children);
	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
	return hpdev;

error:
	kfree(hpdev);
	return NULL;
}

/**
 * get_pcichild_wslot() - Find device from slot
 * @hbus:	Root PCI bus, as understood by this driver
 * @wslot:	Location on the bus
 *
 * This function looks up a PCI device and returns the internal
 * representation of it. It acquires a reference on it, so that
 * the device won't be deleted while somebody is using it. The
 * caller is responsible for calling put_pcichild() to release
 * this reference.
 *
 * Return:	Internal representation of a PCI device
 */
static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
					     u32 wslot)
{
	unsigned long flags;
	struct hv_pci_dev *iter, *hpdev = NULL;

	spin_lock_irqsave(&hbus->device_list_lock, flags);
	list_for_each_entry(iter, &hbus->children, list_entry) {
		if (iter->desc.win_slot.slot == wslot) {
			hpdev = iter;
			get_pcichild(hpdev, hv_pcidev_ref_by_slot);
			break;
		}
	}
	spin_unlock_irqrestore(&hbus->device_list_lock, flags);

	return hpdev;
}

/**
 * pci_devices_present_work() - Handle new list of child devices
 * @work:	Work struct embedded in struct hv_dr_work
 *
 * "Bus Relations" is the Windows term for "children of this
 * bus." The terminology is preserved here for people trying to
 * debug the interaction between Hyper-V and Linux. This
 * function is called when the parent partition reports a list
 * of functions that should be observed under this PCI Express
 * port (bus).
 *
 * This function updates the list, and must tolerate being
 * called multiple times with the same information. The typical
 * number of child devices is one, with very atypical cases
 * involving three or four, so the algorithms used here can be
 * simple and inefficient.
 *
 * It must also treat the omission of a previously observed device as
 * notification that the device no longer exists.
 *
 * Note that this function is a work item, and it may not be
 * invoked in the order that it was queued. Back to back
 * updates of the list of present devices may involve queuing
 * multiple work items, and this one may run before ones that
 * were sent later. As such, this function only does something
 * if it is the last one in the queue.
 */
static void pci_devices_present_work(struct work_struct *work)
{
	u32 child_no;
	bool found;
	struct list_head *iter;
	struct pci_function_description *new_desc;
	struct hv_pci_dev *hpdev;
	struct hv_pcibus_device *hbus;
	struct list_head removed;
	struct hv_dr_work *dr_wrk;
	struct hv_dr_state *dr = NULL;
	unsigned long flags;

	dr_wrk = container_of(work, struct hv_dr_work, wrk);
	hbus = dr_wrk->bus;
	kfree(dr_wrk);

	INIT_LIST_HEAD(&removed);

	if (down_interruptible(&hbus->enum_sem)) {
		put_hvpcibus(hbus);
		return;
	}

	/* Pull this off the queue and process it if it was the last one. */
	spin_lock_irqsave(&hbus->device_list_lock, flags);
	while (!list_empty(&hbus->dr_list)) {
		dr = list_first_entry(&hbus->dr_list, struct hv_dr_state,
				      list_entry);
		list_del(&dr->list_entry);

		/* Throw this away if the list still has stuff in it. */
		if (!list_empty(&hbus->dr_list)) {
			kfree(dr);
			continue;
		}
	}
	spin_unlock_irqrestore(&hbus->device_list_lock, flags);

	if (!dr) {
		up(&hbus->enum_sem);
		put_hvpcibus(hbus);
		return;
	}

	/* First, mark all existing children as reported missing. */
	spin_lock_irqsave(&hbus->device_list_lock, flags);
	list_for_each(iter, &hbus->children) {
		hpdev = container_of(iter, struct hv_pci_dev,
				     list_entry);
		hpdev->reported_missing = true;
	}
	spin_unlock_irqrestore(&hbus->device_list_lock, flags);

	/* Next, add back any reported devices. */
	for (child_no = 0; child_no < dr->device_count; child_no++) {
		found = false;
		new_desc = &dr->func[child_no];

		spin_lock_irqsave(&hbus->device_list_lock, flags);
		list_for_each(iter, &hbus->children) {
			hpdev = container_of(iter, struct hv_pci_dev,
					     list_entry);
			if ((hpdev->desc.win_slot.slot ==
			     new_desc->win_slot.slot) &&
			    (hpdev->desc.v_id == new_desc->v_id) &&
			    (hpdev->desc.d_id == new_desc->d_id) &&
			    (hpdev->desc.ser == new_desc->ser)) {
				hpdev->reported_missing = false;
				found = true;
			}
		}
		spin_unlock_irqrestore(&hbus->device_list_lock, flags);

		if (!found) {
			hpdev = new_pcichild_device(hbus, new_desc);
			if (!hpdev)
				dev_err(&hbus->hdev->device,
					"couldn't record a child device.\n");
		}
	}

	/* Move missing children to a list on the stack. */
	spin_lock_irqsave(&hbus->device_list_lock, flags);
	do {
		found = false;
		list_for_each(iter, &hbus->children) {
			hpdev = container_of(iter, struct hv_pci_dev,
					     list_entry);
			if (hpdev->reported_missing) {
				found = true;
				put_pcichild(hpdev, hv_pcidev_ref_childlist);
				list_move_tail(&hpdev->list_entry, &removed);
				break;
			}
		}
	} while (found);
	spin_unlock_irqrestore(&hbus->device_list_lock, flags);

	/* Delete everything that should no longer exist. */
	while (!list_empty(&removed)) {
		hpdev = list_first_entry(&removed, struct hv_pci_dev,
					 list_entry);
		list_del(&hpdev->list_entry);
		put_pcichild(hpdev, hv_pcidev_ref_initial);
	}

	switch (hbus->state) {
	case hv_pcibus_installed:
		/*
		 * Tell the core to rescan bus
		 * because there may have been changes.
		 */
		pci_lock_rescan_remove();
		pci_scan_child_bus(hbus->pci_bus);
		pci_unlock_rescan_remove();
		break;

	case hv_pcibus_init:
	case hv_pcibus_probed:
		survey_child_resources(hbus);
		break;

	default:
		break;
	}

	up(&hbus->enum_sem);
	put_hvpcibus(hbus);
	kfree(dr);
}

/**
 * hv_pci_devices_present() - Handles list of new children
 * @hbus:	Root PCI bus, as understood by this driver
 * @relations:	Packet from host listing children
 *
 * This function is invoked whenever a new list of devices for
 * this bus appears.
 */
static void hv_pci_devices_present(struct hv_pcibus_device *hbus,
				   struct pci_bus_relations *relations)
{
	struct hv_dr_state *dr;
	struct hv_dr_work *dr_wrk;
	unsigned long flags;

	dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT);
	if (!dr_wrk)
		return;

	dr = kzalloc(offsetof(struct hv_dr_state, func) +
		     (sizeof(struct pci_function_description) *
		      (relations->device_count)), GFP_NOWAIT);
	if (!dr) {
		kfree(dr_wrk);
		return;
	}

	INIT_WORK(&dr_wrk->wrk, pci_devices_present_work);
	dr_wrk->bus = hbus;
	dr->device_count = relations->device_count;
	if (dr->device_count != 0) {
		memcpy(dr->func, relations->func,
		       sizeof(struct pci_function_description) *
		       dr->device_count);
	}

	spin_lock_irqsave(&hbus->device_list_lock, flags);
	list_add_tail(&dr->list_entry, &hbus->dr_list);
	spin_unlock_irqrestore(&hbus->device_list_lock, flags);

	get_hvpcibus(hbus);
	schedule_work(&dr_wrk->wrk);
}

/**
 * hv_eject_device_work() - Asynchronously handles ejection
 * @work:	Work struct embedded in internal device struct
 *
 * This function handles ejecting a device. Windows will
 * attempt to gracefully eject a device, waiting 60 seconds to
 * hear back from the guest OS that this completed successfully.
 * If this timer expires, the device will be forcibly removed.
 */
static void hv_eject_device_work(struct work_struct *work)
{
	struct pci_eject_response *ejct_pkt;
	struct hv_pci_dev *hpdev;
	struct pci_dev *pdev;
	unsigned long flags;
	int wslot;
	struct {
		struct pci_packet pkt;
		u8 buffer[sizeof(struct pci_eject_response)];
	} ctxt;

	hpdev = container_of(work, struct hv_pci_dev, wrk);

	if (hpdev->state != hv_pcichild_ejecting) {
		put_pcichild(hpdev, hv_pcidev_ref_pnp);
		return;
	}

	/*
	 * Ejection can come before or after the PCI bus has been set up, so
	 * attempt to find it and tear down the bus state, if it exists. This
	 * must be done without constructs like pci_domain_nr(hbus->pci_bus)
	 * because hbus->pci_bus may not exist yet.
	 */
	wslot = wslot_to_devfn(hpdev->desc.win_slot.slot);
	pdev = pci_get_domain_bus_and_slot(hpdev->hbus->sysdata.domain, 0,
					   wslot);
	if (pdev) {
		pci_lock_rescan_remove();
		pci_stop_and_remove_bus_device(pdev);
		pci_dev_put(pdev);
		pci_unlock_rescan_remove();
	}

	spin_lock_irqsave(&hpdev->hbus->device_list_lock, flags);
	list_del(&hpdev->list_entry);
	spin_unlock_irqrestore(&hpdev->hbus->device_list_lock, flags);

	memset(&ctxt, 0, sizeof(ctxt));
	ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message;
	ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE;
	ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot;
	vmbus_sendpacket(hpdev->hbus->hdev->channel, ejct_pkt,
			 sizeof(*ejct_pkt), (unsigned long)&ctxt.pkt,
			 VM_PKT_DATA_INBAND, 0);

	put_pcichild(hpdev, hv_pcidev_ref_childlist);
	put_pcichild(hpdev, hv_pcidev_ref_pnp);
	put_hvpcibus(hpdev->hbus);
}

/**
 * hv_pci_eject_device() - Handles device ejection
 * @hpdev:	Internal device tracking struct
 *
 * This function is invoked when an ejection packet arrives. It
 * just schedules work so that we don't re-enter the packet
 * delivery code handling the ejection.
 */
static void hv_pci_eject_device(struct hv_pci_dev *hpdev)
{
	hpdev->state = hv_pcichild_ejecting;
	get_pcichild(hpdev, hv_pcidev_ref_pnp);
	INIT_WORK(&hpdev->wrk, hv_eject_device_work);
	get_hvpcibus(hpdev->hbus);
	schedule_work(&hpdev->wrk);
}

/**
 * hv_pci_onchannelcallback() - Handles incoming packets
 * @context:	Internal bus tracking struct
 *
 * This function is invoked whenever the host sends a packet to
 * this channel (which is private to this root PCI bus).
 */
static void hv_pci_onchannelcallback(void *context)
{
	const int packet_size = 0x100;
	int ret;
	struct hv_pcibus_device *hbus = context;
	u32 bytes_recvd;
	u64 req_id;
	struct vmpacket_descriptor *desc;
	unsigned char *buffer;
	int bufferlen = packet_size;
	struct pci_packet *comp_packet;
	struct pci_response *response;
	struct pci_incoming_message *new_message;
	struct pci_bus_relations *bus_rel;
	struct pci_dev_incoming *dev_message;
	struct hv_pci_dev *hpdev;

	buffer = kmalloc(bufferlen, GFP_ATOMIC);
	if (!buffer)
		return;

	while (1) {
		ret = vmbus_recvpacket_raw(hbus->hdev->channel, buffer,
					   bufferlen, &bytes_recvd, &req_id);

		if (ret == -ENOBUFS) {
			kfree(buffer);
			/* Handle large packet */
			bufferlen = bytes_recvd;
			buffer = kmalloc(bytes_recvd, GFP_ATOMIC);
			if (!buffer)
				return;
			continue;
		}

		/* Zero length indicates there are no more packets. */
		if (ret || !bytes_recvd)
			break;

		/*
		 * All incoming packets must be at least as large as a
		 * response.
		 */
		if (bytes_recvd <= sizeof(struct pci_response))
			continue;
		desc = (struct vmpacket_descriptor *)buffer;

		switch (desc->type) {
		case VM_PKT_COMP:

			/*
			 * The host is trusted, and thus it's safe to interpret
			 * this transaction ID as a pointer.
			 */
			comp_packet = (struct pci_packet *)req_id;
			response = (struct pci_response *)buffer;
			comp_packet->completion_func(comp_packet->compl_ctxt,
						     response,
						     bytes_recvd);
			break;

		case VM_PKT_DATA_INBAND:

			new_message = (struct pci_incoming_message *)buffer;
			switch (new_message->message_type.type) {
			case PCI_BUS_RELATIONS:

				bus_rel = (struct pci_bus_relations *)buffer;
				if (bytes_recvd <
				    offsetof(struct pci_bus_relations, func) +
				    (sizeof(struct pci_function_description) *
				     (bus_rel->device_count))) {
					dev_err(&hbus->hdev->device,
						"bus relations too small\n");
					break;
				}

				hv_pci_devices_present(hbus, bus_rel);
				break;

			case PCI_EJECT:

				dev_message = (struct pci_dev_incoming *)buffer;
				hpdev = get_pcichild_wslot(hbus,
						      dev_message->wslot.slot);
				if (hpdev) {
					hv_pci_eject_device(hpdev);
					put_pcichild(hpdev,
							hv_pcidev_ref_by_slot);
				}
				break;

			default:
				dev_warn(&hbus->hdev->device,
					"Unimplemented protocol message %x\n",
					new_message->message_type.type);
				break;
			}
			break;

		default:
			dev_err(&hbus->hdev->device,
				"unhandled packet type %d, tid %llx len %d\n",
				desc->type, req_id, bytes_recvd);
			break;
		}
	}

	kfree(buffer);
}
1779
1780/**
1781 * hv_pci_protocol_negotiation() - Set up protocol
1782 * @hdev: VMBus's tracking struct for this root PCI bus
1783 *
1784 * This driver is intended to support running on Windows 10
1785 * (server) and later versions. It will not run on earlier
1786 * versions, as they assumed that many of the operations which
1787 * Linux needs done while holding a spinlock were handled through
1788 * asynchronous VMBus messaging. Windows 10 increases the
1789 * surface area of PCI emulation so that these actions can take
1790 * place by suspending a virtual processor for their duration.
1791 *
1792 * This function negotiates the channel protocol version,
1793 * failing if the host doesn't support the necessary protocol
1794 * level.
1795 */
1796static int hv_pci_protocol_negotiation(struct hv_device *hdev)
1797{
1798 struct pci_version_request *version_req;
1799 struct hv_pci_compl comp_pkt;
1800 struct pci_packet *pkt;
1801 int ret;
1802
1803 /*
1804 * Initiate the handshake with the host and negotiate
1805 * a version that the host can support. We start with the
1806 * highest version number and go down if the host cannot
1807 * support it.
1808 */
1809 pkt = kzalloc(sizeof(*pkt) + sizeof(*version_req), GFP_KERNEL);
1810 if (!pkt)
1811 return -ENOMEM;
1812
1813 init_completion(&comp_pkt.host_event);
1814 pkt->completion_func = hv_pci_generic_compl;
1815 pkt->compl_ctxt = &comp_pkt;
1816 version_req = (struct pci_version_request *)&pkt->message;
1817	version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION;
1818 version_req->protocol_version = PCI_PROTOCOL_VERSION_CURRENT;
1819
1820 ret = vmbus_sendpacket(hdev->channel, version_req,
1821 sizeof(struct pci_version_request),
1822 (unsigned long)pkt, VM_PKT_DATA_INBAND,
1823 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1824 if (ret)
1825 goto exit;
1826
1827 wait_for_completion(&comp_pkt.host_event);
1828
1829 if (comp_pkt.completion_status < 0) {
1830 dev_err(&hdev->device,
1831 "PCI Pass-through VSP failed version request %x\n",
1832 comp_pkt.completion_status);
1833 ret = -EPROTO;
1834 goto exit;
1835 }
1836
1837 ret = 0;
1838
1839exit:
1840 kfree(pkt);
1841 return ret;
1842}
1843
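/*
 * Only one protocol version (1.1) is defined at this point, so the
 * negotiation above issues a single request.  If newer versions are
 * added, the "start high and walk down" strategy described in the
 * comment would become a loop over an ordered table; an illustrative
 * sketch, with a hypothetical table name:
 *
 *	static const u32 pci_protocol_versions[] = {
 *		PCI_PROTOCOL_VERSION_1_1,
 *	};
 *
 *	for (i = 0; i < ARRAY_SIZE(pci_protocol_versions); i++) {
 *		version_req->protocol_version = pci_protocol_versions[i];
 *		... send the request and wait for completion ...
 *		if (comp_pkt.completion_status >= 0)
 *			return 0;
 *	}
 *	return -EPROTO;
 */
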
1844/**
1845 * hv_pci_free_bridge_windows() - Release memory regions for the
1846 * bus
1847 * @hbus: Root PCI bus, as understood by this driver
1848 */
1849static void hv_pci_free_bridge_windows(struct hv_pcibus_device *hbus)
1850{
1851 /*
1852 * Set the resources back to the way they looked when they
1853 * were allocated by setting IORESOURCE_BUSY again.
1854 */
1855
1856 if (hbus->low_mmio_space && hbus->low_mmio_res) {
1857 hbus->low_mmio_res->flags |= IORESOURCE_BUSY;
1858 vmbus_free_mmio(hbus->low_mmio_res->start,
1859 resource_size(hbus->low_mmio_res));
1860 }
1861
1862 if (hbus->high_mmio_space && hbus->high_mmio_res) {
1863 hbus->high_mmio_res->flags |= IORESOURCE_BUSY;
1864 vmbus_free_mmio(hbus->high_mmio_res->start,
1865 resource_size(hbus->high_mmio_res));
1866 }
1867}
1868
1869/**
1870 * hv_pci_allocate_bridge_windows() - Allocate memory regions
1871 * for the bus
1872 * @hbus: Root PCI bus, as understood by this driver
1873 *
1874 * This function calls vmbus_allocate_mmio(), which is itself a
1875 * bit of a compromise. Ideally, we might change the pnp layer
1876 * in the kernel such that it comprehends either PCI devices
1877 * which are "grandchildren of ACPI," with some intermediate bus
1878 * node (in this case, VMBus) or change it such that it
1879 * understands VMBus. The pnp layer, however, has been declared
1880 * deprecated, and not subject to change.
1881 *
1882 * The workaround, implemented here, is to ask VMBus to allocate
1883 * MMIO space for this bus. VMBus itself knows which ranges are
1884 * appropriate by looking at its own ACPI objects. Then, after
1885 * these ranges are claimed, they're modified to look like they
1886 * would have looked if the ACPI and pnp code had allocated
1887 * bridge windows. These descriptors have to exist in this form
1888 * in order to satisfy the code which will get invoked when the
1889 * endpoint PCI function driver calls request_mem_region() or
1890 * request_mem_region_exclusive().
1891 *
1892 * Return: 0 on success, -errno on failure
1893 */
1894static int hv_pci_allocate_bridge_windows(struct hv_pcibus_device *hbus)
1895{
1896 resource_size_t align;
1897 int ret;
1898
1899 if (hbus->low_mmio_space) {
1900 align = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
1901 ret = vmbus_allocate_mmio(&hbus->low_mmio_res, hbus->hdev, 0,
1902 (u64)(u32)0xffffffff,
1903 hbus->low_mmio_space,
1904 align, false);
1905 if (ret) {
1906 dev_err(&hbus->hdev->device,
1907 "Need %#llx of low MMIO space. Consider reconfiguring the VM.\n",
1908 hbus->low_mmio_space);
1909 return ret;
1910 }
1911
1912 /* Modify this resource to become a bridge window. */
1913 hbus->low_mmio_res->flags |= IORESOURCE_WINDOW;
1914 hbus->low_mmio_res->flags &= ~IORESOURCE_BUSY;
1915 pci_add_resource(&hbus->resources_for_children,
1916 hbus->low_mmio_res);
1917 }
1918
1919 if (hbus->high_mmio_space) {
1920 align = 1ULL << (63 - __builtin_clzll(hbus->high_mmio_space));
1921 ret = vmbus_allocate_mmio(&hbus->high_mmio_res, hbus->hdev,
1922 0x100000000, -1,
1923 hbus->high_mmio_space, align,
1924 false);
1925 if (ret) {
1926 dev_err(&hbus->hdev->device,
1927 "Need %#llx of high MMIO space. Consider reconfiguring the VM.\n",
1928 hbus->high_mmio_space);
1929 goto release_low_mmio;
1930 }
1931
1932 /* Modify this resource to become a bridge window. */
1933 hbus->high_mmio_res->flags |= IORESOURCE_WINDOW;
1934 hbus->high_mmio_res->flags &= ~IORESOURCE_BUSY;
1935 pci_add_resource(&hbus->resources_for_children,
1936 hbus->high_mmio_res);
1937 }
1938
1939 return 0;
1940
1941release_low_mmio:
1942 if (hbus->low_mmio_res) {
1943 vmbus_free_mmio(hbus->low_mmio_res->start,
1944 resource_size(hbus->low_mmio_res));
1945 }
1946
1947 return ret;
1948}
1949
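/*
 * A note on the alignment math in hv_pci_allocate_bridge_windows():
 * 1ULL << (63 - __builtin_clzll(x)) rounds x down to the largest power
 * of two that does not exceed it, so the window is aligned at least as
 * strictly as the largest BAR that could live inside it.  Worked
 * examples (values are illustrative):
 *
 *	x = 0x180000 (1.5 MiB)  ->  align = 0x100000 (1 MiB)
 *	x = 0x400000 (4 MiB)    ->  align = 0x400000 (4 MiB)
 */
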
1950/**
1951 * hv_allocate_config_window() - Find MMIO space for PCI Config
1952 * @hbus: Root PCI bus, as understood by this driver
1953 *
1954 * This function claims memory-mapped I/O space for accessing
1955 * configuration space for the functions on this bus.
1956 *
1957 * Return: 0 on success, -errno on failure
1958 */
1959static int hv_allocate_config_window(struct hv_pcibus_device *hbus)
1960{
1961 int ret;
1962
1963 /*
1964 * Set up a region of MMIO space to use for accessing configuration
1965 * space.
1966 */
1967 ret = vmbus_allocate_mmio(&hbus->mem_config, hbus->hdev, 0, -1,
1968 PCI_CONFIG_MMIO_LENGTH, 0x1000, false);
1969 if (ret)
1970 return ret;
1971
1972 /*
1973 * vmbus_allocate_mmio() gets used for allocating both device endpoint
1974 * resource claims (those which cannot be overlapped) and the ranges
1975 * which are valid for the children of this bus, which are intended
1976 * to be overlapped by those children. Set the flag on this claim
1977 * meaning that this region can't be overlapped.
1978 */
1979
1980 hbus->mem_config->flags |= IORESOURCE_BUSY;
1981
1982 return 0;
1983}
1984
1985static void hv_free_config_window(struct hv_pcibus_device *hbus)
1986{
1987	vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH);
1988}
1989
1990/**
1991 * hv_pci_enter_d0() - Bring the "bus" into the D0 power state
1992 * @hdev: VMBus's tracking struct for this root PCI bus
1993 *
1994 * Return: 0 on success, -errno on failure
1995 */
1996static int hv_pci_enter_d0(struct hv_device *hdev)
1997{
1998 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
1999 struct pci_bus_d0_entry *d0_entry;
2000 struct hv_pci_compl comp_pkt;
2001 struct pci_packet *pkt;
2002 int ret;
2003
2004 /*
2005 * Tell the host that the bus is ready to use, and moved into the
2006 * powered-on state. This includes telling the host which region
2007 * of memory-mapped I/O space has been chosen for configuration space
2008 * access.
2009 */
2010 pkt = kzalloc(sizeof(*pkt) + sizeof(*d0_entry), GFP_KERNEL);
2011 if (!pkt)
2012 return -ENOMEM;
2013
2014 init_completion(&comp_pkt.host_event);
2015 pkt->completion_func = hv_pci_generic_compl;
2016 pkt->compl_ctxt = &comp_pkt;
2017 d0_entry = (struct pci_bus_d0_entry *)&pkt->message;
2018	d0_entry->message_type.type = PCI_BUS_D0ENTRY;
2019 d0_entry->mmio_base = hbus->mem_config->start;
2020
2021 ret = vmbus_sendpacket(hdev->channel, d0_entry, sizeof(*d0_entry),
2022 (unsigned long)pkt, VM_PKT_DATA_INBAND,
2023 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2024 if (ret)
2025 goto exit;
2026
2027 wait_for_completion(&comp_pkt.host_event);
2028
2029 if (comp_pkt.completion_status < 0) {
2030 dev_err(&hdev->device,
2031 "PCI Pass-through VSP failed D0 Entry with status %x\n",
2032 comp_pkt.completion_status);
2033 ret = -EPROTO;
2034 goto exit;
2035 }
2036
2037 ret = 0;
2038
2039exit:
2040 kfree(pkt);
2041 return ret;
2042}
2043
2044/**
2045 * hv_pci_query_relations() - Ask host to send list of child
2046 * devices
2047 * @hdev: VMBus's tracking struct for this root PCI bus
2048 *
2049 * Return: 0 on success, -errno on failure
2050 */
2051static int hv_pci_query_relations(struct hv_device *hdev)
2052{
2053 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2054 struct pci_message message;
2055 struct completion comp;
2056 int ret;
2057
2058 /* Ask the host to send along the list of child devices */
2059 init_completion(&comp);
2060 if (cmpxchg(&hbus->survey_event, NULL, &comp))
2061 return -ENOTEMPTY;
2062
2063 memset(&message, 0, sizeof(message));
2064	message.type = PCI_QUERY_BUS_RELATIONS;
2065
2066 ret = vmbus_sendpacket(hdev->channel, &message, sizeof(message),
2067 0, VM_PKT_DATA_INBAND, 0);
2068 if (ret)
2069 return ret;
2070
2071 wait_for_completion(&comp);
2072 return 0;
2073}
2074
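/*
 * The cmpxchg() in hv_pci_query_relations() makes the survey a one-shot
 * rendezvous: it installs &comp only if no other survey is already in
 * flight, and the code that processes the host's bus-relations message
 * signals the completion once the child list has been surveyed.
 * Conceptually:
 *
 *	if (cmpxchg(&hbus->survey_event, NULL, &comp))
 *		return -ENOTEMPTY;	(someone else is surveying)
 *	send PCI_QUERY_BUS_RELATIONS;
 *	wait_for_completion(&comp);	(woken by the survey path)
 */
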
2075/**
2076 * hv_send_resources_allocated() - Report local resource choices
2077 * @hdev: VMBus's tracking struct for this root PCI bus
2078 *
2079 * The host OS is expecting to be sent a request as a message
2080 * which contains all the resources that the device will use.
2081 * The response contains those same resources, "translated",
2082 * which is to say, the values which should be used by the
2083 * hardware, when it delivers an interrupt. (MMIO resources are
2084 * used in local terms.) This is nice for Windows, and lines up
2085 * with the FDO/PDO split, which doesn't exist in Linux. Linux
2086 * is deeply expecting to scan an emulated PCI configuration
2087 * space. So this message is sent here only to drive the state
2088 * machine on the host forward.
2089 *
2090 * Return: 0 on success, -errno on failure
2091 */
2092static int hv_send_resources_allocated(struct hv_device *hdev)
2093{
2094 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2095 struct pci_resources_assigned *res_assigned;
2096 struct hv_pci_compl comp_pkt;
2097 struct hv_pci_dev *hpdev;
2098 struct pci_packet *pkt;
2099 u32 wslot;
2100 int ret;
2101
2102 pkt = kmalloc(sizeof(*pkt) + sizeof(*res_assigned), GFP_KERNEL);
2103 if (!pkt)
2104 return -ENOMEM;
2105
2106 ret = 0;
2107
2108 for (wslot = 0; wslot < 256; wslot++) {
2109 hpdev = get_pcichild_wslot(hbus, wslot);
2110 if (!hpdev)
2111 continue;
2112
2113 memset(pkt, 0, sizeof(*pkt) + sizeof(*res_assigned));
2114 init_completion(&comp_pkt.host_event);
2115 pkt->completion_func = hv_pci_generic_compl;
2116 pkt->compl_ctxt = &comp_pkt;
2117		res_assigned = (struct pci_resources_assigned *)&pkt->message;
2118		res_assigned->message_type.type = PCI_RESOURCES_ASSIGNED;
2119 res_assigned->wslot.slot = hpdev->desc.win_slot.slot;
2120
2121 put_pcichild(hpdev, hv_pcidev_ref_by_slot);
2122
2123 ret = vmbus_sendpacket(
2124 hdev->channel, &pkt->message,
2125 sizeof(*res_assigned),
2126 (unsigned long)pkt,
2127 VM_PKT_DATA_INBAND,
2128 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2129 if (ret)
2130 break;
2131
2132 wait_for_completion(&comp_pkt.host_event);
2133
2134 if (comp_pkt.completion_status < 0) {
2135 ret = -EPROTO;
2136 dev_err(&hdev->device,
2137 "resource allocated returned 0x%x",
2138 comp_pkt.completion_status);
2139 break;
2140 }
2141 }
2142
2143 kfree(pkt);
2144 return ret;
2145}
2146
2147/**
2148 * hv_send_resources_released() - Report local resources
2149 * released
2150 * @hdev: VMBus's tracking struct for this root PCI bus
2151 *
2152 * Return: 0 on success, -errno on failure
2153 */
2154static int hv_send_resources_released(struct hv_device *hdev)
2155{
2156 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2157 struct pci_child_message pkt;
2158 struct hv_pci_dev *hpdev;
2159 u32 wslot;
2160 int ret;
2161
2162 for (wslot = 0; wslot < 256; wslot++) {
2163 hpdev = get_pcichild_wslot(hbus, wslot);
2164 if (!hpdev)
2165 continue;
2166
2167 memset(&pkt, 0, sizeof(pkt));
2168		pkt.message_type.type = PCI_RESOURCES_RELEASED;
2169 pkt.wslot.slot = hpdev->desc.win_slot.slot;
2170
2171 put_pcichild(hpdev, hv_pcidev_ref_by_slot);
2172
2173 ret = vmbus_sendpacket(hdev->channel, &pkt, sizeof(pkt), 0,
2174 VM_PKT_DATA_INBAND, 0);
2175 if (ret)
2176 return ret;
2177 }
2178
2179 return 0;
2180}
2181
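/*
 * Both resource walks above try every wslot value from 0 to 255: a PCI
 * bus can hold at most 32 devices with 8 functions each, and the
 * Windows "slot" encoding packs device and function into that range.
 * An illustrative decode, assuming this driver's slot layout (5 bits
 * of device in the low bits, then 3 bits of function):
 *
 *	dev  = wslot & 0x1f;
 *	func = (wslot >> 5) & 0x7;
 */
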
2182static void get_hvpcibus(struct hv_pcibus_device *hbus)
2183{
2184 atomic_inc(&hbus->remove_lock);
2185}
2186
2187static void put_hvpcibus(struct hv_pcibus_device *hbus)
2188{
2189 if (atomic_dec_and_test(&hbus->remove_lock))
2190 complete(&hbus->remove_event);
2191}
2192
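/*
 * get_hvpcibus()/put_hvpcibus() implement a small completion-based
 * reference count: hv_pci_probe() takes the initial reference, each
 * in-flight piece of work takes another, and hv_pci_remove() drops the
 * initial reference and then sleeps on remove_event until the count
 * reaches zero.
 */
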
2193/**
2194 * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
2195 * @hdev: VMBus's tracking struct for this root PCI bus
2196 * @dev_id: Identifies the device itself
2197 *
2198 * Return: 0 on success, -errno on failure
2199 */
2200static int hv_pci_probe(struct hv_device *hdev,
2201 const struct hv_vmbus_device_id *dev_id)
2202{
2203 struct hv_pcibus_device *hbus;
2204 int ret;
2205
2206 hbus = kzalloc(sizeof(*hbus), GFP_KERNEL);
2207 if (!hbus)
2208 return -ENOMEM;
2209	hbus->state = hv_pcibus_init;
2210
2211 /*
2212 * The PCI bus "domain" is what is called "segment" in ACPI and
2213 * other specs. Pull it from the instance ID, to get something
2214 * unique. Bytes 8 and 9 are what is used in Windows guests, so
2215 * do the same thing for consistency. Note that, since this code
2216 * only runs in a Hyper-V VM, Hyper-V can (and does) guarantee
2217 * that (1) the only domain in use for something that looks like
2218 * a physical PCI bus (which is actually emulated by the
2219 * hypervisor) is domain 0 and (2) there will be no overlap
2220 * between domains derived from these instance IDs in the same
2221 * VM.
2222 */
2223 hbus->sysdata.domain = hdev->dev_instance.b[9] |
2224 hdev->dev_instance.b[8] << 8;
2225
2226 hbus->hdev = hdev;
2227 atomic_inc(&hbus->remove_lock);
2228 INIT_LIST_HEAD(&hbus->children);
2229 INIT_LIST_HEAD(&hbus->dr_list);
2230 INIT_LIST_HEAD(&hbus->resources_for_children);
2231 spin_lock_init(&hbus->config_lock);
2232 spin_lock_init(&hbus->device_list_lock);
2233	spin_lock_init(&hbus->retarget_msi_interrupt_lock);
2234 sema_init(&hbus->enum_sem, 1);
2235 init_completion(&hbus->remove_event);
2236
2237 ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
2238 hv_pci_onchannelcallback, hbus);
2239 if (ret)
2240 goto free_bus;
2241
2242 hv_set_drvdata(hdev, hbus);
2243
2244 ret = hv_pci_protocol_negotiation(hdev);
2245 if (ret)
2246 goto close;
2247
2248 ret = hv_allocate_config_window(hbus);
2249 if (ret)
2250 goto close;
2251
2252 hbus->cfg_addr = ioremap(hbus->mem_config->start,
2253 PCI_CONFIG_MMIO_LENGTH);
2254 if (!hbus->cfg_addr) {
2255 dev_err(&hdev->device,
2256 "Unable to map a virtual address for config space\n");
2257 ret = -ENOMEM;
2258 goto free_config;
2259 }
2260
2261 hbus->sysdata.fwnode = irq_domain_alloc_fwnode(hbus);
2262 if (!hbus->sysdata.fwnode) {
2263 ret = -ENOMEM;
2264 goto unmap;
2265 }
2266
2267 ret = hv_pcie_init_irq_domain(hbus);
2268 if (ret)
2269 goto free_fwnode;
2270
2271 ret = hv_pci_query_relations(hdev);
2272 if (ret)
2273 goto free_irq_domain;
2274
2275 ret = hv_pci_enter_d0(hdev);
2276 if (ret)
2277 goto free_irq_domain;
2278
2279 ret = hv_pci_allocate_bridge_windows(hbus);
2280 if (ret)
2281 goto free_irq_domain;
2282
2283 ret = hv_send_resources_allocated(hdev);
2284 if (ret)
2285 goto free_windows;
2286
2287 prepopulate_bars(hbus);
2288
2289 hbus->state = hv_pcibus_probed;
2290
2291 ret = create_root_hv_pci_bus(hbus);
2292 if (ret)
2293 goto free_windows;
2294
2295 return 0;
2296
2297free_windows:
2298 hv_pci_free_bridge_windows(hbus);
2299free_irq_domain:
2300 irq_domain_remove(hbus->irq_domain);
2301free_fwnode:
2302 irq_domain_free_fwnode(hbus->sysdata.fwnode);
2303unmap:
2304 iounmap(hbus->cfg_addr);
2305free_config:
2306 hv_free_config_window(hbus);
2307close:
2308 vmbus_close(hdev->channel);
2309free_bus:
2310 kfree(hbus);
2311 return ret;
2312}
2313
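/*
 * Worked example of the domain derivation in hv_pci_probe(): with
 * illustrative instance-ID bytes b[8] = 0x52 and b[9] = 0x9d, the PCI
 * domain becomes 0x9d | (0x52 << 8) = 0x529d, so devices on this bus
 * show up in lspci with a "529d:" domain prefix.
 */
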
2314static void hv_pci_bus_exit(struct hv_device *hdev)
2315{
2316 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2317 struct {
2318		struct pci_packet teardown_packet;
2319		u8 buffer[sizeof(struct pci_message)];
2320 } pkt;
2321 struct pci_bus_relations relations;
2322 struct hv_pci_compl comp_pkt;
2323	int ret;
2324
2325 /*
2326 * After the host sends the RESCIND_CHANNEL message, it doesn't
2327 * access the per-channel ringbuffer any longer.
2328 */
2329 if (hdev->channel->rescind)
2330 return;
2331
2332 /* Delete any children which might still exist. */
2333 memset(&relations, 0, sizeof(relations));
2334 hv_pci_devices_present(hbus, &relations);
2335
2336 ret = hv_send_resources_released(hdev);
2337 if (ret)
2338 dev_err(&hdev->device,
2339 "Couldn't send resources released packet(s)\n");
2340
2341 memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet));
2342 init_completion(&comp_pkt.host_event);
2343 pkt.teardown_packet.completion_func = hv_pci_generic_compl;
2344 pkt.teardown_packet.compl_ctxt = &comp_pkt;
2345	pkt.teardown_packet.message[0].type = PCI_BUS_D0EXIT;
2346
2347 ret = vmbus_sendpacket(hdev->channel, &pkt.teardown_packet.message,
2348 sizeof(struct pci_message),
2349 (unsigned long)&pkt.teardown_packet,
2350 VM_PKT_DATA_INBAND,
2351 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2352 if (!ret)
2353 wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ);
2354}
2355
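/*
 * Note the trick at the top of hv_pci_bus_exit(): a zeroed struct
 * pci_bus_relations has device_count == 0, so handing it to
 * hv_pci_devices_present() reports "no child devices" and causes every
 * remaining child to be torn down before the bus itself exits D0.
 */
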
2356/**
2357 * hv_pci_remove() - Remove routine for this VMBus channel
2358 * @hdev: VMBus's tracking struct for this root PCI bus
2359 *
2360 * Return: 0 on success, -errno on failure
2361 */
2362static int hv_pci_remove(struct hv_device *hdev)
2363{
2364 struct hv_pcibus_device *hbus;
2365
2366	hbus = hv_get_drvdata(hdev);
2367 if (hbus->state == hv_pcibus_installed) {
2368 /* Remove the bus from PCI's point of view. */
2369 pci_lock_rescan_remove();
2370 pci_stop_root_bus(hbus->pci_bus);
2371 pci_remove_root_bus(hbus->pci_bus);
2372 pci_unlock_rescan_remove();
2373		hbus->state = hv_pcibus_removed;
2374 }
2375
2376	hv_pci_bus_exit(hdev);
2377
2378 vmbus_close(hdev->channel);
2379
2380 iounmap(hbus->cfg_addr);
2381 hv_free_config_window(hbus);
2382 pci_free_resource_list(&hbus->resources_for_children);
2383 hv_pci_free_bridge_windows(hbus);
2384 irq_domain_remove(hbus->irq_domain);
2385 irq_domain_free_fwnode(hbus->sysdata.fwnode);
2386 put_hvpcibus(hbus);
2387 wait_for_completion(&hbus->remove_event);
2388 kfree(hbus);
2389 return 0;
2390}
2391
2392static const struct hv_vmbus_device_id hv_pci_id_table[] = {
2393 /* PCI Pass-through Class ID */
2394 /* 44C4F61D-4444-4400-9D52-802E27EDE19F */
2395 { HV_PCIE_GUID, },
2396 { },
2397};
2398
2399MODULE_DEVICE_TABLE(vmbus, hv_pci_id_table);
2400
2401static struct hv_driver hv_pci_drv = {
2402 .name = "hv_pci",
2403 .id_table = hv_pci_id_table,
2404 .probe = hv_pci_probe,
2405 .remove = hv_pci_remove,
2406};
2407
2408static void __exit exit_hv_pci_drv(void)
2409{
2410 vmbus_driver_unregister(&hv_pci_drv);
2411}
2412
2413static int __init init_hv_pci_drv(void)
2414{
2415 return vmbus_driver_register(&hv_pci_drv);
2416}
2417
2418module_init(init_hv_pci_drv);
2419module_exit(exit_hv_pci_drv);
2420
2421MODULE_DESCRIPTION("Hyper-V PCI");
2422MODULE_LICENSE("GPL v2");