1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2010-2014 Intel Corporation
7 #include <linux/pci_regs.h>
8 #include <sys/eventfd.h>
9 #include <sys/socket.h>
10 #include <sys/ioctl.h>
16 #include <rte_bus_pci.h>
17 #include <rte_eal_memconfig.h>
18 #include <rte_malloc.h>
22 #include <rte_spinlock.h>
24 #include "eal_filesystem.h"
31 * PCI probing under linux (VFIO version)
33 * This code tries to determine if the PCI device is bound to VFIO driver,
34 * and initialize it (map BARs, set up interrupts) if that's the case.
36 * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y".
42 #define PAGE_SIZE (sysconf(_SC_PAGESIZE))
44 #define PAGE_MASK (~(PAGE_SIZE - 1))
46 static struct rte_tailq_elem rte_vfio_tailq
= {
47 .name
= "VFIO_RESOURCE_LIST",
49 EAL_REGISTER_TAILQ(rte_vfio_tailq
)
52 pci_vfio_read_config(const struct rte_intr_handle
*intr_handle
,
53 void *buf
, size_t len
, off_t offs
)
55 return pread64(intr_handle
->vfio_dev_fd
, buf
, len
,
56 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX
) + offs
);
60 pci_vfio_write_config(const struct rte_intr_handle
*intr_handle
,
61 const void *buf
, size_t len
, off_t offs
)
63 return pwrite64(intr_handle
->vfio_dev_fd
, buf
, len
,
64 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX
) + offs
);
67 /* get PCI BAR number where MSI-X interrupts are */
69 pci_vfio_get_msix_bar(int fd
, struct pci_msix_table
*msix_table
)
74 uint8_t cap_id
, cap_offset
;
76 /* read PCI capability pointer from config space */
77 ret
= pread64(fd
, ®
, sizeof(reg
),
78 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX
) +
80 if (ret
!= sizeof(reg
)) {
81 RTE_LOG(ERR
, EAL
, "Cannot read capability pointer from PCI "
86 /* we need first byte */
87 cap_offset
= reg
& 0xFF;
91 /* read PCI capability ID */
92 ret
= pread64(fd
, ®
, sizeof(reg
),
93 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX
) +
95 if (ret
!= sizeof(reg
)) {
96 RTE_LOG(ERR
, EAL
, "Cannot read capability ID from PCI "
101 /* we need first byte */
104 /* if we haven't reached MSI-X, check next capability */
105 if (cap_id
!= PCI_CAP_ID_MSIX
) {
106 ret
= pread64(fd
, ®
, sizeof(reg
),
107 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX
) +
109 if (ret
!= sizeof(reg
)) {
110 RTE_LOG(ERR
, EAL
, "Cannot read capability pointer from PCI "
115 /* we need second byte */
116 cap_offset
= (reg
& 0xFF00) >> 8;
120 /* else, read table offset */
122 /* table offset resides in the next 4 bytes */
123 ret
= pread64(fd
, ®
, sizeof(reg
),
124 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX
) +
126 if (ret
!= sizeof(reg
)) {
127 RTE_LOG(ERR
, EAL
, "Cannot read table offset from PCI config "
132 ret
= pread64(fd
, &flags
, sizeof(flags
),
133 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX
) +
135 if (ret
!= sizeof(flags
)) {
136 RTE_LOG(ERR
, EAL
, "Cannot read table flags from PCI config "
141 msix_table
->bar_index
= reg
& RTE_PCI_MSIX_TABLE_BIR
;
142 msix_table
->offset
= reg
& RTE_PCI_MSIX_TABLE_OFFSET
;
144 16 * (1 + (flags
& RTE_PCI_MSIX_FLAGS_QSIZE
));
152 /* set PCI bus mastering */
154 pci_vfio_set_bus_master(int dev_fd
, bool op
)
159 ret
= pread64(dev_fd
, ®
, sizeof(reg
),
160 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX
) +
162 if (ret
!= sizeof(reg
)) {
163 RTE_LOG(ERR
, EAL
, "Cannot read command from PCI config space!\n");
168 /* set the master bit */
169 reg
|= PCI_COMMAND_MASTER
;
171 reg
&= ~(PCI_COMMAND_MASTER
);
173 ret
= pwrite64(dev_fd
, ®
, sizeof(reg
),
174 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX
) +
177 if (ret
!= sizeof(reg
)) {
178 RTE_LOG(ERR
, EAL
, "Cannot write command to PCI config space!\n");
185 /* set up interrupt support (but not enable interrupts) */
187 pci_vfio_setup_interrupts(struct rte_pci_device
*dev
, int vfio_dev_fd
)
189 int i
, ret
, intr_idx
;
190 enum rte_intr_mode intr_mode
;
192 /* default to invalid index */
193 intr_idx
= VFIO_PCI_NUM_IRQS
;
195 /* Get default / configured intr_mode */
196 intr_mode
= rte_eal_vfio_intr_mode();
198 /* get interrupt type from internal config (MSI-X by default, can be
199 * overridden from the command line
202 case RTE_INTR_MODE_MSIX
:
203 intr_idx
= VFIO_PCI_MSIX_IRQ_INDEX
;
205 case RTE_INTR_MODE_MSI
:
206 intr_idx
= VFIO_PCI_MSI_IRQ_INDEX
;
208 case RTE_INTR_MODE_LEGACY
:
209 intr_idx
= VFIO_PCI_INTX_IRQ_INDEX
;
211 /* don't do anything if we want to automatically determine interrupt type */
212 case RTE_INTR_MODE_NONE
:
215 RTE_LOG(ERR
, EAL
, " unknown default interrupt type!\n");
219 /* start from MSI-X interrupt type */
220 for (i
= VFIO_PCI_MSIX_IRQ_INDEX
; i
>= 0; i
--) {
221 struct vfio_irq_info irq
= { .argsz
= sizeof(irq
) };
224 /* skip interrupt modes we don't want */
225 if (intr_mode
!= RTE_INTR_MODE_NONE
&&
231 ret
= ioctl(vfio_dev_fd
, VFIO_DEVICE_GET_IRQ_INFO
, &irq
);
233 RTE_LOG(ERR
, EAL
, " cannot get IRQ info, "
234 "error %i (%s)\n", errno
, strerror(errno
));
238 /* if this vector cannot be used with eventfd, fail if we explicitly
239 * specified interrupt type, otherwise continue */
240 if ((irq
.flags
& VFIO_IRQ_INFO_EVENTFD
) == 0) {
241 if (intr_mode
!= RTE_INTR_MODE_NONE
) {
243 " interrupt vector does not support eventfd!\n");
249 /* set up an eventfd for interrupts */
250 fd
= eventfd(0, EFD_NONBLOCK
| EFD_CLOEXEC
);
252 RTE_LOG(ERR
, EAL
, " cannot set up eventfd, "
253 "error %i (%s)\n", errno
, strerror(errno
));
257 dev
->intr_handle
.fd
= fd
;
258 dev
->intr_handle
.vfio_dev_fd
= vfio_dev_fd
;
261 case VFIO_PCI_MSIX_IRQ_INDEX
:
262 intr_mode
= RTE_INTR_MODE_MSIX
;
263 dev
->intr_handle
.type
= RTE_INTR_HANDLE_VFIO_MSIX
;
265 case VFIO_PCI_MSI_IRQ_INDEX
:
266 intr_mode
= RTE_INTR_MODE_MSI
;
267 dev
->intr_handle
.type
= RTE_INTR_HANDLE_VFIO_MSI
;
269 case VFIO_PCI_INTX_IRQ_INDEX
:
270 intr_mode
= RTE_INTR_MODE_LEGACY
;
271 dev
->intr_handle
.type
= RTE_INTR_HANDLE_VFIO_LEGACY
;
274 RTE_LOG(ERR
, EAL
, " unknown interrupt type!\n");
281 /* if we're here, we haven't found a suitable interrupt vector */
285 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
287 * Spinlock for device hot-unplug failure handling.
288 * If it tries to access bus or device, such as handle sigbus on bus
289 * or handle memory failure for device, just need to use this lock.
290 * It could protect the bus and the device to avoid race condition.
292 static rte_spinlock_t failure_handle_lock
= RTE_SPINLOCK_INITIALIZER
;
295 pci_vfio_req_handler(void *param
)
299 struct rte_device
*device
= (struct rte_device
*)param
;
301 rte_spinlock_lock(&failure_handle_lock
);
302 bus
= rte_bus_find_by_device(device
);
304 RTE_LOG(ERR
, EAL
, "Cannot find bus for device (%s)\n",
310 * vfio kernel module request user space to release allocated
311 * resources before device be deleted in kernel, so it can directly
312 * call the vfio bus hot-unplug handler to process it.
314 ret
= bus
->hot_unplug_handler(device
);
317 "Can not handle hot-unplug for device (%s)\n",
320 rte_spinlock_unlock(&failure_handle_lock
);
323 /* enable notifier (only enable req now) */
325 pci_vfio_enable_notifier(struct rte_pci_device
*dev
, int vfio_dev_fd
)
330 /* set up an eventfd for req notifier */
331 fd
= eventfd(0, EFD_NONBLOCK
| EFD_CLOEXEC
);
333 RTE_LOG(ERR
, EAL
, "Cannot set up eventfd, error %i (%s)\n",
334 errno
, strerror(errno
));
338 dev
->vfio_req_intr_handle
.fd
= fd
;
339 dev
->vfio_req_intr_handle
.type
= RTE_INTR_HANDLE_VFIO_REQ
;
340 dev
->vfio_req_intr_handle
.vfio_dev_fd
= vfio_dev_fd
;
342 ret
= rte_intr_callback_register(&dev
->vfio_req_intr_handle
,
343 pci_vfio_req_handler
,
344 (void *)&dev
->device
);
346 RTE_LOG(ERR
, EAL
, "Fail to register req notifier handler.\n");
350 ret
= rte_intr_enable(&dev
->vfio_req_intr_handle
);
352 RTE_LOG(ERR
, EAL
, "Fail to enable req notifier.\n");
353 ret
= rte_intr_callback_unregister(&dev
->vfio_req_intr_handle
,
354 pci_vfio_req_handler
,
355 (void *)&dev
->device
);
358 "Fail to unregister req notifier handler.\n");
366 dev
->vfio_req_intr_handle
.fd
= -1;
367 dev
->vfio_req_intr_handle
.type
= RTE_INTR_HANDLE_UNKNOWN
;
368 dev
->vfio_req_intr_handle
.vfio_dev_fd
= -1;
373 /* disable notifier (only disable req now) */
375 pci_vfio_disable_notifier(struct rte_pci_device
*dev
)
379 ret
= rte_intr_disable(&dev
->vfio_req_intr_handle
);
381 RTE_LOG(ERR
, EAL
, "fail to disable req notifier.\n");
385 ret
= rte_intr_callback_unregister(&dev
->vfio_req_intr_handle
,
386 pci_vfio_req_handler
,
387 (void *)&dev
->device
);
390 "fail to unregister req notifier handler.\n");
394 close(dev
->vfio_req_intr_handle
.fd
);
396 dev
->vfio_req_intr_handle
.fd
= -1;
397 dev
->vfio_req_intr_handle
.type
= RTE_INTR_HANDLE_UNKNOWN
;
398 dev
->vfio_req_intr_handle
.vfio_dev_fd
= -1;
405 pci_vfio_is_ioport_bar(int vfio_dev_fd
, int bar_index
)
410 ret
= pread64(vfio_dev_fd
, &ioport_bar
, sizeof(ioport_bar
),
411 VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX
)
412 + PCI_BASE_ADDRESS_0
+ bar_index
*4);
413 if (ret
!= sizeof(ioport_bar
)) {
414 RTE_LOG(ERR
, EAL
, "Cannot read command (%x) from config space!\n",
415 PCI_BASE_ADDRESS_0
+ bar_index
*4);
419 return (ioport_bar
& PCI_BASE_ADDRESS_SPACE_IO
) != 0;
423 pci_rte_vfio_setup_device(struct rte_pci_device
*dev
, int vfio_dev_fd
)
425 if (pci_vfio_setup_interrupts(dev
, vfio_dev_fd
) != 0) {
426 RTE_LOG(ERR
, EAL
, "Error setting up interrupts!\n");
430 /* set bus mastering for the device */
431 if (pci_vfio_set_bus_master(vfio_dev_fd
, true)) {
432 RTE_LOG(ERR
, EAL
, "Cannot set up bus mastering!\n");
437 * Reset the device. If the device is not capable of resetting,
438 * then it updates errno as EINVAL.
440 if (ioctl(vfio_dev_fd
, VFIO_DEVICE_RESET
) && errno
!= EINVAL
) {
441 RTE_LOG(ERR
, EAL
, "Unable to reset device! Error: %d (%s)\n",
442 errno
, strerror(errno
));
450 pci_vfio_mmap_bar(int vfio_dev_fd
, struct mapped_pci_resource
*vfio_res
,
451 int bar_index
, int additional_flags
)
454 unsigned long offset
, size
;
457 struct pci_msix_table
*msix_table
= &vfio_res
->msix_table
;
458 struct pci_map
*bar
= &vfio_res
->maps
[bar_index
];
460 if (bar
->size
== 0) {
461 RTE_LOG(DEBUG
, EAL
, "Bar size is 0, skip BAR%d\n", bar_index
);
465 if (msix_table
->bar_index
== bar_index
) {
467 * VFIO will not let us map the MSI-X table,
468 * but we can map around it.
470 uint32_t table_start
= msix_table
->offset
;
471 uint32_t table_end
= table_start
+ msix_table
->size
;
472 table_end
= RTE_ALIGN(table_end
, PAGE_SIZE
);
473 table_start
= RTE_ALIGN_FLOOR(table_start
, PAGE_SIZE
);
475 /* If page-aligned start of MSI-X table is less than the
476 * actual MSI-X table start address, reassign to the actual
479 if (table_start
< msix_table
->offset
)
480 table_start
= msix_table
->offset
;
482 if (table_start
== 0 && table_end
>= bar
->size
) {
483 /* Cannot map this BAR */
484 RTE_LOG(DEBUG
, EAL
, "Skipping BAR%d\n", bar_index
);
490 memreg
[0].offset
= bar
->offset
;
491 memreg
[0].size
= table_start
;
492 if (bar
->size
< table_end
) {
494 * If MSI-X table end is beyond BAR end, don't attempt
495 * to perform second mapping.
497 memreg
[1].offset
= 0;
500 memreg
[1].offset
= bar
->offset
+ table_end
;
501 memreg
[1].size
= bar
->size
- table_end
;
505 "Trying to map BAR%d that contains the MSI-X "
506 "table. Trying offsets: "
507 "0x%04lx:0x%04lx, 0x%04lx:0x%04lx\n", bar_index
,
508 memreg
[0].offset
, memreg
[0].size
,
509 memreg
[1].offset
, memreg
[1].size
);
511 memreg
[0].offset
= bar
->offset
;
512 memreg
[0].size
= bar
->size
;
515 /* reserve the address using an inaccessible mapping */
516 bar_addr
= mmap(bar
->addr
, bar
->size
, 0, MAP_PRIVATE
|
517 MAP_ANONYMOUS
| additional_flags
, -1, 0);
518 if (bar_addr
!= MAP_FAILED
) {
519 void *map_addr
= NULL
;
520 if (memreg
[0].size
) {
521 /* actual map of first part */
522 map_addr
= pci_map_resource(bar_addr
, vfio_dev_fd
,
528 /* if there's a second part, try to map it */
529 if (map_addr
!= MAP_FAILED
530 && memreg
[1].offset
&& memreg
[1].size
) {
531 void *second_addr
= RTE_PTR_ADD(bar_addr
,
533 (uintptr_t)bar
->offset
);
534 map_addr
= pci_map_resource(second_addr
,
541 if (map_addr
== MAP_FAILED
|| !map_addr
) {
542 munmap(bar_addr
, bar
->size
);
543 bar_addr
= MAP_FAILED
;
544 RTE_LOG(ERR
, EAL
, "Failed to map pci BAR%d\n",
550 "Failed to create inaccessible mapping for BAR%d\n",
555 bar
->addr
= bar_addr
;
560 * region info may contain capability headers, so we need to keep reallocating
561 * the memory until we match allocated memory size with argsz.
564 pci_vfio_get_region_info(int vfio_dev_fd
, struct vfio_region_info
**info
,
567 struct vfio_region_info
*ri
;
568 size_t argsz
= sizeof(*ri
);
571 ri
= malloc(sizeof(*ri
));
573 RTE_LOG(ERR
, EAL
, "Cannot allocate memory for region info\n");
577 memset(ri
, 0, argsz
);
581 ret
= ioctl(vfio_dev_fd
, VFIO_DEVICE_GET_REGION_INFO
, ri
);
586 if (ri
->argsz
!= argsz
) {
587 struct vfio_region_info
*tmp
;
590 tmp
= realloc(ri
, argsz
);
593 /* realloc failed but the ri is still there */
595 RTE_LOG(ERR
, EAL
, "Cannot reallocate memory for region info\n");
606 static struct vfio_info_cap_header
*
607 pci_vfio_info_cap(struct vfio_region_info
*info
, int cap
)
609 struct vfio_info_cap_header
*h
;
612 if ((info
->flags
& RTE_VFIO_INFO_FLAG_CAPS
) == 0) {
613 /* VFIO info does not advertise capabilities */
617 offset
= VFIO_CAP_OFFSET(info
);
618 while (offset
!= 0) {
619 h
= RTE_PTR_ADD(info
, offset
);
628 pci_vfio_msix_is_mappable(int vfio_dev_fd
, int msix_region
)
630 struct vfio_region_info
*info
;
633 ret
= pci_vfio_get_region_info(vfio_dev_fd
, &info
, msix_region
);
637 ret
= pci_vfio_info_cap(info
, RTE_VFIO_CAP_MSIX_MAPPABLE
) != NULL
;
647 pci_vfio_map_resource_primary(struct rte_pci_device
*dev
)
649 struct vfio_device_info device_info
= { .argsz
= sizeof(device_info
) };
650 char pci_addr
[PATH_MAX
] = {0};
652 struct rte_pci_addr
*loc
= &dev
->addr
;
654 struct mapped_pci_resource
*vfio_res
= NULL
;
655 struct mapped_pci_res_list
*vfio_res_list
=
656 RTE_TAILQ_CAST(rte_vfio_tailq
.head
, mapped_pci_res_list
);
658 struct pci_map
*maps
;
660 dev
->intr_handle
.fd
= -1;
661 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
662 dev
->vfio_req_intr_handle
.fd
= -1;
665 /* store PCI address string */
666 snprintf(pci_addr
, sizeof(pci_addr
), PCI_PRI_FMT
,
667 loc
->domain
, loc
->bus
, loc
->devid
, loc
->function
);
669 ret
= rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr
,
670 &vfio_dev_fd
, &device_info
);
674 /* allocate vfio_res and get region info */
675 vfio_res
= rte_zmalloc("VFIO_RES", sizeof(*vfio_res
), 0);
676 if (vfio_res
== NULL
) {
678 "%s(): cannot store vfio mmap details\n", __func__
);
679 goto err_vfio_dev_fd
;
681 memcpy(&vfio_res
->pci_addr
, &dev
->addr
, sizeof(vfio_res
->pci_addr
));
683 /* get number of registers (up to BAR5) */
684 vfio_res
->nb_maps
= RTE_MIN((int) device_info
.num_regions
,
685 VFIO_PCI_BAR5_REGION_INDEX
+ 1);
688 maps
= vfio_res
->maps
;
690 vfio_res
->msix_table
.bar_index
= -1;
691 /* get MSI-X BAR, if any (we have to know where it is because we can't
692 * easily mmap it when using VFIO)
694 ret
= pci_vfio_get_msix_bar(vfio_dev_fd
, &vfio_res
->msix_table
);
696 RTE_LOG(ERR
, EAL
, " %s cannot get MSI-X BAR number!\n",
700 /* if we found our MSI-X BAR region, check if we can mmap it */
701 if (vfio_res
->msix_table
.bar_index
!= -1) {
702 int ret
= pci_vfio_msix_is_mappable(vfio_dev_fd
,
703 vfio_res
->msix_table
.bar_index
);
705 RTE_LOG(ERR
, EAL
, "Couldn't check if MSI-X BAR is mappable\n");
707 } else if (ret
!= 0) {
708 /* we can map it, so we don't care where it is */
709 RTE_LOG(DEBUG
, EAL
, "VFIO reports MSI-X BAR as mappable\n");
710 vfio_res
->msix_table
.bar_index
= -1;
714 for (i
= 0; i
< (int) vfio_res
->nb_maps
; i
++) {
715 struct vfio_region_info
*reg
= NULL
;
718 ret
= pci_vfio_get_region_info(vfio_dev_fd
, ®
, i
);
720 RTE_LOG(ERR
, EAL
, " %s cannot get device region info "
721 "error %i (%s)\n", pci_addr
, errno
,
726 /* chk for io port region */
727 ret
= pci_vfio_is_ioport_bar(vfio_dev_fd
, i
);
732 RTE_LOG(INFO
, EAL
, "Ignore mapping IO port bar(%d)\n",
738 /* skip non-mmapable BARs */
739 if ((reg
->flags
& VFIO_REGION_INFO_FLAG_MMAP
) == 0) {
744 /* try mapping somewhere close to the end of hugepages */
745 if (pci_map_addr
== NULL
)
746 pci_map_addr
= pci_find_max_end_va();
748 bar_addr
= pci_map_addr
;
749 pci_map_addr
= RTE_PTR_ADD(bar_addr
, (size_t) reg
->size
);
751 maps
[i
].addr
= bar_addr
;
752 maps
[i
].offset
= reg
->offset
;
753 maps
[i
].size
= reg
->size
;
754 maps
[i
].path
= NULL
; /* vfio doesn't have per-resource paths */
756 ret
= pci_vfio_mmap_bar(vfio_dev_fd
, vfio_res
, i
, 0);
758 RTE_LOG(ERR
, EAL
, " %s mapping BAR%i failed: %s\n",
759 pci_addr
, i
, strerror(errno
));
764 dev
->mem_resource
[i
].addr
= maps
[i
].addr
;
769 if (pci_rte_vfio_setup_device(dev
, vfio_dev_fd
) < 0) {
770 RTE_LOG(ERR
, EAL
, " %s setup device failed\n", pci_addr
);
774 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
775 if (pci_vfio_enable_notifier(dev
, vfio_dev_fd
) != 0) {
776 RTE_LOG(ERR
, EAL
, "Error setting up notifier!\n");
781 TAILQ_INSERT_TAIL(vfio_res_list
, vfio_res
, next
);
792 pci_vfio_map_resource_secondary(struct rte_pci_device
*dev
)
794 struct vfio_device_info device_info
= { .argsz
= sizeof(device_info
) };
795 char pci_addr
[PATH_MAX
] = {0};
797 struct rte_pci_addr
*loc
= &dev
->addr
;
799 struct mapped_pci_resource
*vfio_res
= NULL
;
800 struct mapped_pci_res_list
*vfio_res_list
=
801 RTE_TAILQ_CAST(rte_vfio_tailq
.head
, mapped_pci_res_list
);
803 struct pci_map
*maps
;
805 dev
->intr_handle
.fd
= -1;
806 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
807 dev
->vfio_req_intr_handle
.fd
= -1;
810 /* store PCI address string */
811 snprintf(pci_addr
, sizeof(pci_addr
), PCI_PRI_FMT
,
812 loc
->domain
, loc
->bus
, loc
->devid
, loc
->function
);
814 /* if we're in a secondary process, just find our tailq entry */
815 TAILQ_FOREACH(vfio_res
, vfio_res_list
, next
) {
816 if (rte_pci_addr_cmp(&vfio_res
->pci_addr
,
821 /* if we haven't found our tailq entry, something's wrong */
822 if (vfio_res
== NULL
) {
823 RTE_LOG(ERR
, EAL
, " %s cannot find TAILQ entry for PCI device!\n",
828 ret
= rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr
,
829 &vfio_dev_fd
, &device_info
);
834 maps
= vfio_res
->maps
;
836 for (i
= 0; i
< (int) vfio_res
->nb_maps
; i
++) {
837 ret
= pci_vfio_mmap_bar(vfio_dev_fd
, vfio_res
, i
, MAP_FIXED
);
839 RTE_LOG(ERR
, EAL
, " %s mapping BAR%i failed: %s\n",
840 pci_addr
, i
, strerror(errno
));
841 goto err_vfio_dev_fd
;
844 dev
->mem_resource
[i
].addr
= maps
[i
].addr
;
847 /* we need save vfio_dev_fd, so it can be used during release */
848 dev
->intr_handle
.vfio_dev_fd
= vfio_dev_fd
;
849 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
850 dev
->vfio_req_intr_handle
.vfio_dev_fd
= vfio_dev_fd
;
860 * map the PCI resources of a PCI device in virtual memory (VFIO version).
861 * primary and secondary processes follow almost exactly the same path
864 pci_vfio_map_resource(struct rte_pci_device
*dev
)
866 if (rte_eal_process_type() == RTE_PROC_PRIMARY
)
867 return pci_vfio_map_resource_primary(dev
);
869 return pci_vfio_map_resource_secondary(dev
);
872 static struct mapped_pci_resource
*
873 find_and_unmap_vfio_resource(struct mapped_pci_res_list
*vfio_res_list
,
874 struct rte_pci_device
*dev
,
875 const char *pci_addr
)
877 struct mapped_pci_resource
*vfio_res
= NULL
;
878 struct pci_map
*maps
;
882 TAILQ_FOREACH(vfio_res
, vfio_res_list
, next
) {
883 if (rte_pci_addr_cmp(&vfio_res
->pci_addr
, &dev
->addr
))
888 if (vfio_res
== NULL
)
891 RTE_LOG(INFO
, EAL
, "Releasing pci mapped resource for %s\n",
894 maps
= vfio_res
->maps
;
895 for (i
= 0; i
< (int) vfio_res
->nb_maps
; i
++) {
898 * We do not need to be aware of MSI-X table BAR mappings as
899 * when mapping. Just using current maps array is enough
902 RTE_LOG(INFO
, EAL
, "Calling pci_unmap_resource for %s at %p\n",
903 pci_addr
, maps
[i
].addr
);
904 pci_unmap_resource(maps
[i
].addr
, maps
[i
].size
);
912 pci_vfio_unmap_resource_primary(struct rte_pci_device
*dev
)
914 char pci_addr
[PATH_MAX
] = {0};
915 struct rte_pci_addr
*loc
= &dev
->addr
;
916 struct mapped_pci_resource
*vfio_res
= NULL
;
917 struct mapped_pci_res_list
*vfio_res_list
;
920 /* store PCI address string */
921 snprintf(pci_addr
, sizeof(pci_addr
), PCI_PRI_FMT
,
922 loc
->domain
, loc
->bus
, loc
->devid
, loc
->function
);
924 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
925 ret
= pci_vfio_disable_notifier(dev
);
927 RTE_LOG(ERR
, EAL
, "fail to disable req notifier.\n");
932 if (close(dev
->intr_handle
.fd
) < 0) {
933 RTE_LOG(INFO
, EAL
, "Error when closing eventfd file descriptor for %s\n",
938 if (pci_vfio_set_bus_master(dev
->intr_handle
.vfio_dev_fd
, false)) {
939 RTE_LOG(ERR
, EAL
, " %s cannot unset bus mastering for PCI device!\n",
944 ret
= rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr
,
945 dev
->intr_handle
.vfio_dev_fd
);
948 "%s(): cannot release device\n", __func__
);
953 RTE_TAILQ_CAST(rte_vfio_tailq
.head
, mapped_pci_res_list
);
954 vfio_res
= find_and_unmap_vfio_resource(vfio_res_list
, dev
, pci_addr
);
956 /* if we haven't found our tailq entry, something's wrong */
957 if (vfio_res
== NULL
) {
958 RTE_LOG(ERR
, EAL
, " %s cannot find TAILQ entry for PCI device!\n",
963 TAILQ_REMOVE(vfio_res_list
, vfio_res
, next
);
969 pci_vfio_unmap_resource_secondary(struct rte_pci_device
*dev
)
971 char pci_addr
[PATH_MAX
] = {0};
972 struct rte_pci_addr
*loc
= &dev
->addr
;
973 struct mapped_pci_resource
*vfio_res
= NULL
;
974 struct mapped_pci_res_list
*vfio_res_list
;
977 /* store PCI address string */
978 snprintf(pci_addr
, sizeof(pci_addr
), PCI_PRI_FMT
,
979 loc
->domain
, loc
->bus
, loc
->devid
, loc
->function
);
981 ret
= rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr
,
982 dev
->intr_handle
.vfio_dev_fd
);
985 "%s(): cannot release device\n", __func__
);
990 RTE_TAILQ_CAST(rte_vfio_tailq
.head
, mapped_pci_res_list
);
991 vfio_res
= find_and_unmap_vfio_resource(vfio_res_list
, dev
, pci_addr
);
993 /* if we haven't found our tailq entry, something's wrong */
994 if (vfio_res
== NULL
) {
995 RTE_LOG(ERR
, EAL
, " %s cannot find TAILQ entry for PCI device!\n",
1004 pci_vfio_unmap_resource(struct rte_pci_device
*dev
)
1006 if (rte_eal_process_type() == RTE_PROC_PRIMARY
)
1007 return pci_vfio_unmap_resource_primary(dev
);
1009 return pci_vfio_unmap_resource_secondary(dev
);
1013 pci_vfio_ioport_map(struct rte_pci_device
*dev
, int bar
,
1014 struct rte_pci_ioport
*p
)
1016 if (bar
< VFIO_PCI_BAR0_REGION_INDEX
||
1017 bar
> VFIO_PCI_BAR5_REGION_INDEX
) {
1018 RTE_LOG(ERR
, EAL
, "invalid bar (%d)!\n", bar
);
1023 p
->base
= VFIO_GET_REGION_ADDR(bar
);
1028 pci_vfio_ioport_read(struct rte_pci_ioport
*p
,
1029 void *data
, size_t len
, off_t offset
)
1031 const struct rte_intr_handle
*intr_handle
= &p
->dev
->intr_handle
;
1033 if (pread64(intr_handle
->vfio_dev_fd
, data
,
1034 len
, p
->base
+ offset
) <= 0)
1036 "Can't read from PCI bar (%" PRIu64
") : offset (%x)\n",
1037 VFIO_GET_REGION_IDX(p
->base
), (int)offset
);
1041 pci_vfio_ioport_write(struct rte_pci_ioport
*p
,
1042 const void *data
, size_t len
, off_t offset
)
1044 const struct rte_intr_handle
*intr_handle
= &p
->dev
->intr_handle
;
1046 if (pwrite64(intr_handle
->vfio_dev_fd
, data
,
1047 len
, p
->base
+ offset
) <= 0)
1049 "Can't write to PCI bar (%" PRIu64
") : offset (%x)\n",
1050 VFIO_GET_REGION_IDX(p
->base
), (int)offset
);
/*
 * Nothing to undo for an ioport mapping under VFIO; kept for interface
 * symmetry with the other bus drivers.
 * NOTE(review): return value reconstructed — confirm against upstream.
 */
int
pci_vfio_ioport_unmap(struct rte_pci_ioport *p)
{
	RTE_SET_USED(p);
	return -1;
}
/* Return non-zero if the vfio_pci kernel module is loaded and usable. */
int
pci_vfio_is_enabled(void)
{
	return rte_vfio_is_enabled("vfio_pci");
}