]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | /* SPDX-License-Identifier: BSD-3-Clause |
2 | * Copyright(c) 2010-2014 Intel Corporation | |
3 | */ | |
4 | ||
5 | #include <string.h> | |
6 | #include <fcntl.h> | |
7 | #include <linux/pci_regs.h> | |
8 | #include <sys/eventfd.h> | |
9 | #include <sys/socket.h> | |
10 | #include <sys/ioctl.h> | |
11 | #include <sys/mman.h> | |
12 | #include <stdbool.h> | |
13 | ||
14 | #include <rte_log.h> | |
15 | #include <rte_pci.h> | |
16 | #include <rte_bus_pci.h> | |
11fdf7f2 TL |
17 | #include <rte_malloc.h> |
18 | #include <rte_vfio.h> | |
9f95a23c TL |
19 | #include <rte_eal.h> |
20 | #include <rte_bus.h> | |
21 | #include <rte_spinlock.h> | |
f67539c2 | 22 | #include <rte_tailq.h> |
11fdf7f2 TL |
23 | |
24 | #include "eal_filesystem.h" | |
25 | ||
26 | #include "pci_init.h" | |
27 | #include "private.h" | |
28 | ||
29 | /** | |
30 | * @file | |
31 | * PCI probing under linux (VFIO version) | |
32 | * | |
33 | * This code tries to determine if the PCI device is bound to VFIO driver, | |
34 | * and initialize it (map BARs, set up interrupts) if that's the case. | |
35 | * | |
36 | * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y". | |
37 | */ | |
38 | ||
39 | #ifdef VFIO_PRESENT | |
40 | ||
9f95a23c | 41 | #ifndef PAGE_SIZE |
11fdf7f2 | 42 | #define PAGE_SIZE (sysconf(_SC_PAGESIZE)) |
9f95a23c | 43 | #endif |
11fdf7f2 TL |
44 | #define PAGE_MASK (~(PAGE_SIZE - 1)) |
45 | ||
46 | static struct rte_tailq_elem rte_vfio_tailq = { | |
47 | .name = "VFIO_RESOURCE_LIST", | |
48 | }; | |
49 | EAL_REGISTER_TAILQ(rte_vfio_tailq) | |
50 | ||
51 | int | |
52 | pci_vfio_read_config(const struct rte_intr_handle *intr_handle, | |
53 | void *buf, size_t len, off_t offs) | |
54 | { | |
55 | return pread64(intr_handle->vfio_dev_fd, buf, len, | |
56 | VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs); | |
57 | } | |
58 | ||
59 | int | |
60 | pci_vfio_write_config(const struct rte_intr_handle *intr_handle, | |
61 | const void *buf, size_t len, off_t offs) | |
62 | { | |
63 | return pwrite64(intr_handle->vfio_dev_fd, buf, len, | |
64 | VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs); | |
65 | } | |
66 | ||
67 | /* get PCI BAR number where MSI-X interrupts are */ | |
68 | static int | |
69 | pci_vfio_get_msix_bar(int fd, struct pci_msix_table *msix_table) | |
70 | { | |
71 | int ret; | |
72 | uint32_t reg; | |
73 | uint16_t flags; | |
74 | uint8_t cap_id, cap_offset; | |
75 | ||
76 | /* read PCI capability pointer from config space */ | |
77 | ret = pread64(fd, ®, sizeof(reg), | |
78 | VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + | |
79 | PCI_CAPABILITY_LIST); | |
80 | if (ret != sizeof(reg)) { | |
81 | RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI " | |
82 | "config space!\n"); | |
83 | return -1; | |
84 | } | |
85 | ||
86 | /* we need first byte */ | |
87 | cap_offset = reg & 0xFF; | |
88 | ||
89 | while (cap_offset) { | |
90 | ||
91 | /* read PCI capability ID */ | |
92 | ret = pread64(fd, ®, sizeof(reg), | |
93 | VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + | |
94 | cap_offset); | |
95 | if (ret != sizeof(reg)) { | |
96 | RTE_LOG(ERR, EAL, "Cannot read capability ID from PCI " | |
97 | "config space!\n"); | |
98 | return -1; | |
99 | } | |
100 | ||
101 | /* we need first byte */ | |
102 | cap_id = reg & 0xFF; | |
103 | ||
104 | /* if we haven't reached MSI-X, check next capability */ | |
105 | if (cap_id != PCI_CAP_ID_MSIX) { | |
106 | ret = pread64(fd, ®, sizeof(reg), | |
107 | VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + | |
108 | cap_offset); | |
109 | if (ret != sizeof(reg)) { | |
110 | RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI " | |
111 | "config space!\n"); | |
112 | return -1; | |
113 | } | |
114 | ||
115 | /* we need second byte */ | |
116 | cap_offset = (reg & 0xFF00) >> 8; | |
117 | ||
118 | continue; | |
119 | } | |
120 | /* else, read table offset */ | |
121 | else { | |
122 | /* table offset resides in the next 4 bytes */ | |
123 | ret = pread64(fd, ®, sizeof(reg), | |
124 | VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + | |
125 | cap_offset + 4); | |
126 | if (ret != sizeof(reg)) { | |
127 | RTE_LOG(ERR, EAL, "Cannot read table offset from PCI config " | |
128 | "space!\n"); | |
129 | return -1; | |
130 | } | |
131 | ||
132 | ret = pread64(fd, &flags, sizeof(flags), | |
133 | VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + | |
134 | cap_offset + 2); | |
135 | if (ret != sizeof(flags)) { | |
136 | RTE_LOG(ERR, EAL, "Cannot read table flags from PCI config " | |
137 | "space!\n"); | |
138 | return -1; | |
139 | } | |
140 | ||
141 | msix_table->bar_index = reg & RTE_PCI_MSIX_TABLE_BIR; | |
142 | msix_table->offset = reg & RTE_PCI_MSIX_TABLE_OFFSET; | |
143 | msix_table->size = | |
144 | 16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE)); | |
145 | ||
146 | return 0; | |
147 | } | |
148 | } | |
149 | return 0; | |
150 | } | |
151 | ||
152 | /* set PCI bus mastering */ | |
153 | static int | |
154 | pci_vfio_set_bus_master(int dev_fd, bool op) | |
155 | { | |
156 | uint16_t reg; | |
157 | int ret; | |
158 | ||
159 | ret = pread64(dev_fd, ®, sizeof(reg), | |
160 | VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + | |
161 | PCI_COMMAND); | |
162 | if (ret != sizeof(reg)) { | |
163 | RTE_LOG(ERR, EAL, "Cannot read command from PCI config space!\n"); | |
164 | return -1; | |
165 | } | |
166 | ||
167 | if (op) | |
168 | /* set the master bit */ | |
169 | reg |= PCI_COMMAND_MASTER; | |
170 | else | |
171 | reg &= ~(PCI_COMMAND_MASTER); | |
172 | ||
173 | ret = pwrite64(dev_fd, ®, sizeof(reg), | |
174 | VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + | |
175 | PCI_COMMAND); | |
176 | ||
177 | if (ret != sizeof(reg)) { | |
178 | RTE_LOG(ERR, EAL, "Cannot write command to PCI config space!\n"); | |
179 | return -1; | |
180 | } | |
181 | ||
182 | return 0; | |
183 | } | |
184 | ||
185 | /* set up interrupt support (but not enable interrupts) */ | |
186 | static int | |
187 | pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd) | |
188 | { | |
189 | int i, ret, intr_idx; | |
190 | enum rte_intr_mode intr_mode; | |
191 | ||
192 | /* default to invalid index */ | |
193 | intr_idx = VFIO_PCI_NUM_IRQS; | |
194 | ||
195 | /* Get default / configured intr_mode */ | |
196 | intr_mode = rte_eal_vfio_intr_mode(); | |
197 | ||
198 | /* get interrupt type from internal config (MSI-X by default, can be | |
199 | * overridden from the command line | |
200 | */ | |
201 | switch (intr_mode) { | |
202 | case RTE_INTR_MODE_MSIX: | |
203 | intr_idx = VFIO_PCI_MSIX_IRQ_INDEX; | |
204 | break; | |
205 | case RTE_INTR_MODE_MSI: | |
206 | intr_idx = VFIO_PCI_MSI_IRQ_INDEX; | |
207 | break; | |
208 | case RTE_INTR_MODE_LEGACY: | |
209 | intr_idx = VFIO_PCI_INTX_IRQ_INDEX; | |
210 | break; | |
211 | /* don't do anything if we want to automatically determine interrupt type */ | |
212 | case RTE_INTR_MODE_NONE: | |
213 | break; | |
214 | default: | |
215 | RTE_LOG(ERR, EAL, " unknown default interrupt type!\n"); | |
216 | return -1; | |
217 | } | |
218 | ||
219 | /* start from MSI-X interrupt type */ | |
220 | for (i = VFIO_PCI_MSIX_IRQ_INDEX; i >= 0; i--) { | |
221 | struct vfio_irq_info irq = { .argsz = sizeof(irq) }; | |
222 | int fd = -1; | |
223 | ||
224 | /* skip interrupt modes we don't want */ | |
225 | if (intr_mode != RTE_INTR_MODE_NONE && | |
226 | i != intr_idx) | |
227 | continue; | |
228 | ||
229 | irq.index = i; | |
230 | ||
231 | ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq); | |
232 | if (ret < 0) { | |
233 | RTE_LOG(ERR, EAL, " cannot get IRQ info, " | |
234 | "error %i (%s)\n", errno, strerror(errno)); | |
235 | return -1; | |
236 | } | |
237 | ||
238 | /* if this vector cannot be used with eventfd, fail if we explicitly | |
239 | * specified interrupt type, otherwise continue */ | |
240 | if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) { | |
241 | if (intr_mode != RTE_INTR_MODE_NONE) { | |
242 | RTE_LOG(ERR, EAL, | |
243 | " interrupt vector does not support eventfd!\n"); | |
244 | return -1; | |
245 | } else | |
246 | continue; | |
247 | } | |
248 | ||
249 | /* set up an eventfd for interrupts */ | |
250 | fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); | |
251 | if (fd < 0) { | |
252 | RTE_LOG(ERR, EAL, " cannot set up eventfd, " | |
253 | "error %i (%s)\n", errno, strerror(errno)); | |
254 | return -1; | |
255 | } | |
256 | ||
257 | dev->intr_handle.fd = fd; | |
258 | dev->intr_handle.vfio_dev_fd = vfio_dev_fd; | |
259 | ||
260 | switch (i) { | |
261 | case VFIO_PCI_MSIX_IRQ_INDEX: | |
262 | intr_mode = RTE_INTR_MODE_MSIX; | |
263 | dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX; | |
264 | break; | |
265 | case VFIO_PCI_MSI_IRQ_INDEX: | |
266 | intr_mode = RTE_INTR_MODE_MSI; | |
267 | dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSI; | |
268 | break; | |
269 | case VFIO_PCI_INTX_IRQ_INDEX: | |
270 | intr_mode = RTE_INTR_MODE_LEGACY; | |
271 | dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_LEGACY; | |
272 | break; | |
273 | default: | |
274 | RTE_LOG(ERR, EAL, " unknown interrupt type!\n"); | |
275 | return -1; | |
276 | } | |
277 | ||
278 | return 0; | |
279 | } | |
280 | ||
281 | /* if we're here, we haven't found a suitable interrupt vector */ | |
282 | return -1; | |
283 | } | |
284 | ||
9f95a23c TL |
285 | #ifdef HAVE_VFIO_DEV_REQ_INTERFACE |
286 | /* | |
287 | * Spinlock for device hot-unplug failure handling. | |
288 | * If it tries to access bus or device, such as handle sigbus on bus | |
289 | * or handle memory failure for device, just need to use this lock. | |
290 | * It could protect the bus and the device to avoid race condition. | |
291 | */ | |
292 | static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER; | |
293 | ||
294 | static void | |
295 | pci_vfio_req_handler(void *param) | |
296 | { | |
297 | struct rte_bus *bus; | |
298 | int ret; | |
299 | struct rte_device *device = (struct rte_device *)param; | |
300 | ||
301 | rte_spinlock_lock(&failure_handle_lock); | |
302 | bus = rte_bus_find_by_device(device); | |
303 | if (bus == NULL) { | |
304 | RTE_LOG(ERR, EAL, "Cannot find bus for device (%s)\n", | |
305 | device->name); | |
306 | goto handle_end; | |
307 | } | |
308 | ||
309 | /* | |
310 | * vfio kernel module request user space to release allocated | |
311 | * resources before device be deleted in kernel, so it can directly | |
312 | * call the vfio bus hot-unplug handler to process it. | |
313 | */ | |
314 | ret = bus->hot_unplug_handler(device); | |
315 | if (ret) | |
316 | RTE_LOG(ERR, EAL, | |
317 | "Can not handle hot-unplug for device (%s)\n", | |
318 | device->name); | |
319 | handle_end: | |
320 | rte_spinlock_unlock(&failure_handle_lock); | |
321 | } | |
322 | ||
323 | /* enable notifier (only enable req now) */ | |
324 | static int | |
325 | pci_vfio_enable_notifier(struct rte_pci_device *dev, int vfio_dev_fd) | |
326 | { | |
327 | int ret; | |
328 | int fd = -1; | |
329 | ||
330 | /* set up an eventfd for req notifier */ | |
331 | fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); | |
332 | if (fd < 0) { | |
333 | RTE_LOG(ERR, EAL, "Cannot set up eventfd, error %i (%s)\n", | |
334 | errno, strerror(errno)); | |
335 | return -1; | |
336 | } | |
337 | ||
338 | dev->vfio_req_intr_handle.fd = fd; | |
339 | dev->vfio_req_intr_handle.type = RTE_INTR_HANDLE_VFIO_REQ; | |
340 | dev->vfio_req_intr_handle.vfio_dev_fd = vfio_dev_fd; | |
341 | ||
342 | ret = rte_intr_callback_register(&dev->vfio_req_intr_handle, | |
343 | pci_vfio_req_handler, | |
344 | (void *)&dev->device); | |
345 | if (ret) { | |
346 | RTE_LOG(ERR, EAL, "Fail to register req notifier handler.\n"); | |
347 | goto error; | |
348 | } | |
349 | ||
350 | ret = rte_intr_enable(&dev->vfio_req_intr_handle); | |
351 | if (ret) { | |
352 | RTE_LOG(ERR, EAL, "Fail to enable req notifier.\n"); | |
353 | ret = rte_intr_callback_unregister(&dev->vfio_req_intr_handle, | |
354 | pci_vfio_req_handler, | |
355 | (void *)&dev->device); | |
356 | if (ret < 0) | |
357 | RTE_LOG(ERR, EAL, | |
358 | "Fail to unregister req notifier handler.\n"); | |
359 | goto error; | |
360 | } | |
361 | ||
362 | return 0; | |
363 | error: | |
364 | close(fd); | |
365 | ||
366 | dev->vfio_req_intr_handle.fd = -1; | |
367 | dev->vfio_req_intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; | |
368 | dev->vfio_req_intr_handle.vfio_dev_fd = -1; | |
369 | ||
370 | return -1; | |
371 | } | |
372 | ||
373 | /* disable notifier (only disable req now) */ | |
374 | static int | |
375 | pci_vfio_disable_notifier(struct rte_pci_device *dev) | |
376 | { | |
377 | int ret; | |
378 | ||
379 | ret = rte_intr_disable(&dev->vfio_req_intr_handle); | |
380 | if (ret) { | |
381 | RTE_LOG(ERR, EAL, "fail to disable req notifier.\n"); | |
382 | return -1; | |
383 | } | |
384 | ||
385 | ret = rte_intr_callback_unregister(&dev->vfio_req_intr_handle, | |
386 | pci_vfio_req_handler, | |
387 | (void *)&dev->device); | |
388 | if (ret < 0) { | |
389 | RTE_LOG(ERR, EAL, | |
390 | "fail to unregister req notifier handler.\n"); | |
391 | return -1; | |
392 | } | |
393 | ||
394 | close(dev->vfio_req_intr_handle.fd); | |
395 | ||
396 | dev->vfio_req_intr_handle.fd = -1; | |
397 | dev->vfio_req_intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; | |
398 | dev->vfio_req_intr_handle.vfio_dev_fd = -1; | |
399 | ||
400 | return 0; | |
401 | } | |
402 | #endif | |
403 | ||
11fdf7f2 TL |
404 | static int |
405 | pci_vfio_is_ioport_bar(int vfio_dev_fd, int bar_index) | |
406 | { | |
407 | uint32_t ioport_bar; | |
408 | int ret; | |
409 | ||
410 | ret = pread64(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar), | |
411 | VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) | |
412 | + PCI_BASE_ADDRESS_0 + bar_index*4); | |
413 | if (ret != sizeof(ioport_bar)) { | |
414 | RTE_LOG(ERR, EAL, "Cannot read command (%x) from config space!\n", | |
415 | PCI_BASE_ADDRESS_0 + bar_index*4); | |
416 | return -1; | |
417 | } | |
418 | ||
419 | return (ioport_bar & PCI_BASE_ADDRESS_SPACE_IO) != 0; | |
420 | } | |
421 | ||
422 | static int | |
423 | pci_rte_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd) | |
424 | { | |
425 | if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) { | |
426 | RTE_LOG(ERR, EAL, "Error setting up interrupts!\n"); | |
427 | return -1; | |
428 | } | |
429 | ||
430 | /* set bus mastering for the device */ | |
431 | if (pci_vfio_set_bus_master(vfio_dev_fd, true)) { | |
432 | RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n"); | |
433 | return -1; | |
434 | } | |
435 | ||
436 | /* | |
437 | * Reset the device. If the device is not capable of resetting, | |
438 | * then it updates errno as EINVAL. | |
439 | */ | |
440 | if (ioctl(vfio_dev_fd, VFIO_DEVICE_RESET) && errno != EINVAL) { | |
441 | RTE_LOG(ERR, EAL, "Unable to reset device! Error: %d (%s)\n", | |
442 | errno, strerror(errno)); | |
443 | return -1; | |
444 | } | |
445 | ||
446 | return 0; | |
447 | } | |
448 | ||
/*
 * mmap one BAR of the device into the address recorded in
 * vfio_res->maps[bar_index].
 *
 * VFIO will not let us map the page(s) that contain the MSI-X table, so
 * when the table lives in this BAR, the BAR is mapped in up to two pieces
 * around the page-aligned table window. additional_flags is OR-ed into the
 * flags of the anonymous placeholder mmap (callers pass MAP_FIXED in
 * secondary processes to reuse the primary's addresses).
 *
 * Returns 0 on success (including "BAR skipped"), -1 on mapping failure.
 */
static int
pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
		int bar_index, int additional_flags)
{
	/* pieces of the BAR to map around the MSI-X table, if any */
	struct memreg {
		uint64_t offset;
		size_t size;
	} memreg[2] = {};
	void *bar_addr;
	struct pci_msix_table *msix_table = &vfio_res->msix_table;
	struct pci_map *bar = &vfio_res->maps[bar_index];

	/* an absent/empty BAR is not an error */
	if (bar->size == 0) {
		RTE_LOG(DEBUG, EAL, "Bar size is 0, skip BAR%d\n", bar_index);
		return 0;
	}

	if (msix_table->bar_index == bar_index) {
		/*
		 * VFIO will not let us map the MSI-X table,
		 * but we can map around it.
		 */
		uint32_t table_start = msix_table->offset;
		uint32_t table_end = table_start + msix_table->size;
		/* widen the excluded window to whole pages */
		table_end = RTE_ALIGN(table_end, PAGE_SIZE);
		table_start = RTE_ALIGN_FLOOR(table_start, PAGE_SIZE);

		/* If page-aligned start of MSI-X table is less than the
		 * actual MSI-X table start address, reassign to the actual
		 * start address.
		 */
		if (table_start < msix_table->offset)
			table_start = msix_table->offset;

		if (table_start == 0 && table_end >= bar->size) {
			/* Cannot map this BAR */
			RTE_LOG(DEBUG, EAL, "Skipping BAR%d\n", bar_index);
			bar->size = 0;
			bar->addr = 0;
			return 0;
		}

		/* first piece: from BAR start up to the table */
		memreg[0].offset = bar->offset;
		memreg[0].size = table_start;
		if (bar->size < table_end) {
			/*
			 * If MSI-X table end is beyond BAR end, don't attempt
			 * to perform second mapping.
			 */
			memreg[1].offset = 0;
			memreg[1].size = 0;
		} else {
			/* second piece: from table end to the end of the BAR */
			memreg[1].offset = bar->offset + table_end;
			memreg[1].size = bar->size - table_end;
		}

		RTE_LOG(DEBUG, EAL,
			"Trying to map BAR%d that contains the MSI-X "
			"table. Trying offsets: "
			"0x%04" PRIx64 ":0x%04zx, 0x%04" PRIx64 ":0x%04zx\n",
			bar_index,
			memreg[0].offset, memreg[0].size,
			memreg[1].offset, memreg[1].size);
	} else {
		/* no MSI-X table in this BAR: map it in one piece */
		memreg[0].offset = bar->offset;
		memreg[0].size = bar->size;
	}

	/* reserve the address using an inaccessible mapping */
	bar_addr = mmap(bar->addr, bar->size, 0, MAP_PRIVATE |
			MAP_ANONYMOUS | additional_flags, -1, 0);
	if (bar_addr != MAP_FAILED) {
		void *map_addr = NULL;
		if (memreg[0].size) {
			/* actual map of first part */
			map_addr = pci_map_resource(bar_addr, vfio_dev_fd,
							memreg[0].offset,
							memreg[0].size,
							MAP_FIXED);
		}

		/* if there's a second part, try to map it */
		if (map_addr != MAP_FAILED
			&& memreg[1].offset && memreg[1].size) {
			/* place it at its natural offset within the BAR */
			void *second_addr = RTE_PTR_ADD(bar_addr,
						(uintptr_t)(memreg[1].offset -
						bar->offset));
			map_addr = pci_map_resource(second_addr,
							vfio_dev_fd,
							memreg[1].offset,
							memreg[1].size,
							MAP_FIXED);
		}

		/* NOTE(review): map_addr stays NULL if both pieces were
		 * empty; that case is treated as a failure here. */
		if (map_addr == MAP_FAILED || !map_addr) {
			munmap(bar_addr, bar->size);
			bar_addr = MAP_FAILED;
			RTE_LOG(ERR, EAL, "Failed to map pci BAR%d\n",
					bar_index);
			return -1;
		}
	} else {
		RTE_LOG(ERR, EAL,
			"Failed to create inaccessible mapping for BAR%d\n",
			bar_index);
		return -1;
	}

	bar->addr = bar_addr;
	return 0;
}
560 | ||
9f95a23c TL |
561 | /* |
562 | * region info may contain capability headers, so we need to keep reallocating | |
563 | * the memory until we match allocated memory size with argsz. | |
564 | */ | |
565 | static int | |
566 | pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info, | |
567 | int region) | |
568 | { | |
569 | struct vfio_region_info *ri; | |
570 | size_t argsz = sizeof(*ri); | |
571 | int ret; | |
572 | ||
573 | ri = malloc(sizeof(*ri)); | |
574 | if (ri == NULL) { | |
575 | RTE_LOG(ERR, EAL, "Cannot allocate memory for region info\n"); | |
576 | return -1; | |
577 | } | |
578 | again: | |
579 | memset(ri, 0, argsz); | |
580 | ri->argsz = argsz; | |
581 | ri->index = region; | |
582 | ||
583 | ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ri); | |
584 | if (ret < 0) { | |
585 | free(ri); | |
586 | return ret; | |
587 | } | |
588 | if (ri->argsz != argsz) { | |
589 | struct vfio_region_info *tmp; | |
590 | ||
591 | argsz = ri->argsz; | |
592 | tmp = realloc(ri, argsz); | |
593 | ||
594 | if (tmp == NULL) { | |
595 | /* realloc failed but the ri is still there */ | |
596 | free(ri); | |
597 | RTE_LOG(ERR, EAL, "Cannot reallocate memory for region info\n"); | |
598 | return -1; | |
599 | } | |
600 | ri = tmp; | |
601 | goto again; | |
602 | } | |
603 | *info = ri; | |
604 | ||
605 | return 0; | |
606 | } | |
607 | ||
608 | static struct vfio_info_cap_header * | |
609 | pci_vfio_info_cap(struct vfio_region_info *info, int cap) | |
610 | { | |
611 | struct vfio_info_cap_header *h; | |
612 | size_t offset; | |
613 | ||
614 | if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) { | |
615 | /* VFIO info does not advertise capabilities */ | |
616 | return NULL; | |
617 | } | |
618 | ||
619 | offset = VFIO_CAP_OFFSET(info); | |
620 | while (offset != 0) { | |
621 | h = RTE_PTR_ADD(info, offset); | |
622 | if (h->id == cap) | |
623 | return h; | |
624 | offset = h->next; | |
625 | } | |
626 | return NULL; | |
627 | } | |
628 | ||
629 | static int | |
630 | pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region) | |
631 | { | |
632 | struct vfio_region_info *info; | |
633 | int ret; | |
634 | ||
635 | ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region); | |
636 | if (ret < 0) | |
637 | return -1; | |
638 | ||
639 | ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL; | |
640 | ||
641 | /* cleanup */ | |
642 | free(info); | |
643 | ||
644 | return ret; | |
645 | } | |
646 | ||
647 | ||
11fdf7f2 TL |
/*
 * Primary-process path for mapping a VFIO PCI device: open the device via
 * VFIO, allocate a mapped_pci_resource entry, locate the MSI-X table,
 * mmap every mappable memory BAR, run final device setup, and add the
 * entry to the shared tailq so secondary processes can find it.
 *
 * Returns 0 on success; on failure, frees vfio_res, closes the device fd
 * and returns a negative value.
 */
static int
pci_vfio_map_resource_primary(struct rte_pci_device *dev)
{
	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
	char pci_addr[PATH_MAX] = {0};
	int vfio_dev_fd;
	struct rte_pci_addr *loc = &dev->addr;
	int i, ret;
	struct mapped_pci_resource *vfio_res = NULL;
	struct mapped_pci_res_list *vfio_res_list =
		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);

	struct pci_map *maps;

	/* mark interrupt handles as unset until setup succeeds */
	dev->intr_handle.fd = -1;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	dev->vfio_req_intr_handle.fd = -1;
#endif

	/* store PCI address string */
	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
			loc->domain, loc->bus, loc->devid, loc->function);

	ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
					&vfio_dev_fd, &device_info);
	if (ret)
		return ret;

	/* allocate vfio_res and get region info */
	vfio_res = rte_zmalloc("VFIO_RES", sizeof(*vfio_res), 0);
	if (vfio_res == NULL) {
		RTE_LOG(ERR, EAL,
			"%s(): cannot store vfio mmap details\n", __func__);
		goto err_vfio_dev_fd;
	}
	memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr));

	/* get number of registers (up to BAR5) */
	vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions,
			VFIO_PCI_BAR5_REGION_INDEX + 1);

	/* map BARs */
	maps = vfio_res->maps;

	/* -1 means "no MSI-X BAR found / none needs special handling" */
	vfio_res->msix_table.bar_index = -1;
	/* get MSI-X BAR, if any (we have to know where it is because we can't
	 * easily mmap it when using VFIO)
	 */
	ret = pci_vfio_get_msix_bar(vfio_dev_fd, &vfio_res->msix_table);
	if (ret < 0) {
		RTE_LOG(ERR, EAL, " %s cannot get MSI-X BAR number!\n",
				pci_addr);
		goto err_vfio_res;
	}
	/* if we found our MSI-X BAR region, check if we can mmap it */
	if (vfio_res->msix_table.bar_index != -1) {
		/* NOTE: inner ret intentionally shadows the outer one */
		int ret = pci_vfio_msix_is_mappable(vfio_dev_fd,
				vfio_res->msix_table.bar_index);
		if (ret < 0) {
			RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable\n");
			goto err_vfio_res;
		} else if (ret != 0) {
			/* we can map it, so we don't care where it is */
			RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable\n");
			vfio_res->msix_table.bar_index = -1;
		}
	}

	for (i = 0; i < (int) vfio_res->nb_maps; i++) {
		struct vfio_region_info *reg = NULL;
		void *bar_addr;

		/* reg is heap-allocated; every path below must free it */
		ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
		if (ret < 0) {
			RTE_LOG(ERR, EAL, " %s cannot get device region info "
				"error %i (%s)\n", pci_addr, errno,
				strerror(errno));
			goto err_vfio_res;
		}

		/* chk for io port region */
		ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i);
		if (ret < 0) {
			free(reg);
			goto err_vfio_res;
		} else if (ret) {
			RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n",
					i);
			free(reg);
			continue;
		}

		/* skip non-mmapable BARs */
		if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
			free(reg);
			continue;
		}

		/* try mapping somewhere close to the end of hugepages */
		if (pci_map_addr == NULL)
			pci_map_addr = pci_find_max_end_va();

		/* bump the shared hint past this BAR, page-aligned */
		bar_addr = pci_map_addr;
		pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);

		pci_map_addr = RTE_PTR_ALIGN(pci_map_addr,
					sysconf(_SC_PAGE_SIZE));

		maps[i].addr = bar_addr;
		maps[i].offset = reg->offset;
		maps[i].size = reg->size;
		maps[i].path = NULL; /* vfio doesn't have per-resource paths */

		ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
		if (ret < 0) {
			RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n",
					pci_addr, i, strerror(errno));
			free(reg);
			goto err_vfio_res;
		}

		dev->mem_resource[i].addr = maps[i].addr;

		free(reg);
	}

	if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) {
		RTE_LOG(ERR, EAL, " %s setup device failed\n", pci_addr);
		goto err_vfio_res;
	}

#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	if (pci_vfio_enable_notifier(dev, vfio_dev_fd) != 0) {
		RTE_LOG(ERR, EAL, "Error setting up notifier!\n");
		goto err_vfio_res;
	}

#endif
	/* publish the mapping so secondary processes can reuse it */
	TAILQ_INSERT_TAIL(vfio_res_list, vfio_res, next);

	return 0;
err_vfio_res:
	rte_free(vfio_res);
err_vfio_dev_fd:
	close(vfio_dev_fd);
	return -1;
}
795 | ||
/*
 * Secondary-process path for mapping a VFIO PCI device: look up the
 * mapped_pci_resource entry the primary process published in the shared
 * tailq, open the device via VFIO, and remap every BAR at the exact
 * addresses the primary chose (MAP_FIXED).
 *
 * Returns 0 on success, negative on failure.
 */
static int
pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
{
	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
	char pci_addr[PATH_MAX] = {0};
	int vfio_dev_fd;
	struct rte_pci_addr *loc = &dev->addr;
	int i, ret;
	struct mapped_pci_resource *vfio_res = NULL;
	struct mapped_pci_res_list *vfio_res_list =
		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);

	struct pci_map *maps;

	/* mark interrupt handles as unset until setup succeeds */
	dev->intr_handle.fd = -1;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	dev->vfio_req_intr_handle.fd = -1;
#endif

	/* store PCI address string */
	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
			loc->domain, loc->bus, loc->devid, loc->function);

	/* if we're in a secondary process, just find our tailq entry */
	TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
		if (rte_pci_addr_cmp(&vfio_res->pci_addr,
						 &dev->addr))
			continue;
		break;
	}
	/* if we haven't found our tailq entry, something's wrong */
	if (vfio_res == NULL) {
		RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n",
				pci_addr);
		return -1;
	}

	ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
					&vfio_dev_fd, &device_info);
	if (ret)
		return ret;

	/* map BARs */
	maps = vfio_res->maps;

	for (i = 0; i < (int) vfio_res->nb_maps; i++) {
		/* MAP_FIXED: reuse the addresses chosen by the primary */
		ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, MAP_FIXED);
		if (ret < 0) {
			RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n",
					pci_addr, i, strerror(errno));
			goto err_vfio_dev_fd;
		}

		dev->mem_resource[i].addr = maps[i].addr;
	}

	/* we need save vfio_dev_fd, so it can be used during release */
	dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	dev->vfio_req_intr_handle.vfio_dev_fd = vfio_dev_fd;
#endif

	return 0;
err_vfio_dev_fd:
	close(vfio_dev_fd);
	return -1;
}
863 | ||
864 | /* | |
865 | * map the PCI resources of a PCI device in virtual memory (VFIO version). | |
866 | * primary and secondary processes follow almost exactly the same path | |
867 | */ | |
868 | int | |
869 | pci_vfio_map_resource(struct rte_pci_device *dev) | |
870 | { | |
871 | if (rte_eal_process_type() == RTE_PROC_PRIMARY) | |
872 | return pci_vfio_map_resource_primary(dev); | |
873 | else | |
874 | return pci_vfio_map_resource_secondary(dev); | |
875 | } | |
876 | ||
877 | static struct mapped_pci_resource * | |
878 | find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list, | |
879 | struct rte_pci_device *dev, | |
880 | const char *pci_addr) | |
881 | { | |
882 | struct mapped_pci_resource *vfio_res = NULL; | |
883 | struct pci_map *maps; | |
884 | int i; | |
885 | ||
886 | /* Get vfio_res */ | |
887 | TAILQ_FOREACH(vfio_res, vfio_res_list, next) { | |
888 | if (rte_pci_addr_cmp(&vfio_res->pci_addr, &dev->addr)) | |
889 | continue; | |
890 | break; | |
891 | } | |
892 | ||
893 | if (vfio_res == NULL) | |
894 | return vfio_res; | |
895 | ||
896 | RTE_LOG(INFO, EAL, "Releasing pci mapped resource for %s\n", | |
897 | pci_addr); | |
898 | ||
899 | maps = vfio_res->maps; | |
900 | for (i = 0; i < (int) vfio_res->nb_maps; i++) { | |
901 | ||
902 | /* | |
903 | * We do not need to be aware of MSI-X table BAR mappings as | |
904 | * when mapping. Just using current maps array is enough | |
905 | */ | |
906 | if (maps[i].addr) { | |
907 | RTE_LOG(INFO, EAL, "Calling pci_unmap_resource for %s at %p\n", | |
908 | pci_addr, maps[i].addr); | |
909 | pci_unmap_resource(maps[i].addr, maps[i].size); | |
910 | } | |
911 | } | |
912 | ||
913 | return vfio_res; | |
914 | } | |
915 | ||
/*
 * Primary-process teardown: disable the req notifier (if built), close the
 * interrupt eventfd, drop bus mastering, release the device back to VFIO,
 * unmap all BARs and remove the resource entry from the shared tailq.
 *
 * Returns 0 on success, negative on failure.
 */
static int
pci_vfio_unmap_resource_primary(struct rte_pci_device *dev)
{
	char pci_addr[PATH_MAX] = {0};
	struct rte_pci_addr *loc = &dev->addr;
	struct mapped_pci_resource *vfio_res = NULL;
	struct mapped_pci_res_list *vfio_res_list;
	int ret;

	/* store PCI address string */
	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
			loc->domain, loc->bus, loc->devid, loc->function);

#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
	ret = pci_vfio_disable_notifier(dev);
	if (ret) {
		RTE_LOG(ERR, EAL, "fail to disable req notifier.\n");
		return -1;
	}

#endif
	/* close the eventfd created by pci_vfio_setup_interrupts() */
	if (close(dev->intr_handle.fd) < 0) {
		RTE_LOG(INFO, EAL, "Error when closing eventfd file descriptor for %s\n",
			pci_addr);
		return -1;
	}

	if (pci_vfio_set_bus_master(dev->intr_handle.vfio_dev_fd, false)) {
		RTE_LOG(ERR, EAL, " %s cannot unset bus mastering for PCI device!\n",
				pci_addr);
		return -1;
	}

	ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr,
				  dev->intr_handle.vfio_dev_fd);
	if (ret < 0) {
		RTE_LOG(ERR, EAL,
			"%s(): cannot release device\n", __func__);
		return ret;
	}

	vfio_res_list =
		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
	vfio_res = find_and_unmap_vfio_resource(vfio_res_list, dev, pci_addr);

	/* if we haven't found our tailq entry, something's wrong */
	if (vfio_res == NULL) {
		RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n",
				pci_addr);
		return -1;
	}

	TAILQ_REMOVE(vfio_res_list, vfio_res, next);

	return 0;
}
972 | ||
973 | static int | |
974 | pci_vfio_unmap_resource_secondary(struct rte_pci_device *dev) | |
975 | { | |
976 | char pci_addr[PATH_MAX] = {0}; | |
977 | struct rte_pci_addr *loc = &dev->addr; | |
978 | struct mapped_pci_resource *vfio_res = NULL; | |
979 | struct mapped_pci_res_list *vfio_res_list; | |
980 | int ret; | |
981 | ||
982 | /* store PCI address string */ | |
983 | snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, | |
984 | loc->domain, loc->bus, loc->devid, loc->function); | |
985 | ||
986 | ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr, | |
987 | dev->intr_handle.vfio_dev_fd); | |
988 | if (ret < 0) { | |
989 | RTE_LOG(ERR, EAL, | |
990 | "%s(): cannot release device\n", __func__); | |
991 | return ret; | |
992 | } | |
993 | ||
994 | vfio_res_list = | |
995 | RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list); | |
996 | vfio_res = find_and_unmap_vfio_resource(vfio_res_list, dev, pci_addr); | |
997 | ||
998 | /* if we haven't found our tailq entry, something's wrong */ | |
999 | if (vfio_res == NULL) { | |
1000 | RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n", | |
1001 | pci_addr); | |
1002 | return -1; | |
1003 | } | |
1004 | ||
1005 | return 0; | |
1006 | } | |
1007 | ||
1008 | int | |
1009 | pci_vfio_unmap_resource(struct rte_pci_device *dev) | |
1010 | { | |
1011 | if (rte_eal_process_type() == RTE_PROC_PRIMARY) | |
1012 | return pci_vfio_unmap_resource_primary(dev); | |
1013 | else | |
1014 | return pci_vfio_unmap_resource_secondary(dev); | |
1015 | } | |
1016 | ||
1017 | int | |
1018 | pci_vfio_ioport_map(struct rte_pci_device *dev, int bar, | |
1019 | struct rte_pci_ioport *p) | |
1020 | { | |
1021 | if (bar < VFIO_PCI_BAR0_REGION_INDEX || | |
1022 | bar > VFIO_PCI_BAR5_REGION_INDEX) { | |
1023 | RTE_LOG(ERR, EAL, "invalid bar (%d)!\n", bar); | |
1024 | return -1; | |
1025 | } | |
1026 | ||
1027 | p->dev = dev; | |
1028 | p->base = VFIO_GET_REGION_ADDR(bar); | |
1029 | return 0; | |
1030 | } | |
1031 | ||
1032 | void | |
1033 | pci_vfio_ioport_read(struct rte_pci_ioport *p, | |
1034 | void *data, size_t len, off_t offset) | |
1035 | { | |
1036 | const struct rte_intr_handle *intr_handle = &p->dev->intr_handle; | |
1037 | ||
1038 | if (pread64(intr_handle->vfio_dev_fd, data, | |
1039 | len, p->base + offset) <= 0) | |
1040 | RTE_LOG(ERR, EAL, | |
1041 | "Can't read from PCI bar (%" PRIu64 ") : offset (%x)\n", | |
1042 | VFIO_GET_REGION_IDX(p->base), (int)offset); | |
1043 | } | |
1044 | ||
1045 | void | |
1046 | pci_vfio_ioport_write(struct rte_pci_ioport *p, | |
1047 | const void *data, size_t len, off_t offset) | |
1048 | { | |
1049 | const struct rte_intr_handle *intr_handle = &p->dev->intr_handle; | |
1050 | ||
1051 | if (pwrite64(intr_handle->vfio_dev_fd, data, | |
1052 | len, p->base + offset) <= 0) | |
1053 | RTE_LOG(ERR, EAL, | |
1054 | "Can't write to PCI bar (%" PRIu64 ") : offset (%x)\n", | |
1055 | VFIO_GET_REGION_IDX(p->base), (int)offset); | |
1056 | } | |
1057 | ||
/*
 * No per-ioport state is kept by pci_vfio_ioport_map(), so there is
 * nothing to undo; this operation is unsupported and always returns -1.
 */
int
pci_vfio_ioport_unmap(struct rte_pci_ioport *p)
{
	(void)p;

	return -1;
}
1064 | ||
/*
 * Report whether VFIO PCI support is available, by asking EAL if the
 * "vfio_pci" kernel driver is usable.
 */
int
pci_vfio_is_enabled(void)
{
	static const char drv_name[] = "vfio_pci";

	return rte_vfio_is_enabled(drv_name);
}
1070 | #endif |