/*
 * VFIO utility
 *
 * Copyright 2016 - 2018 Red Hat, Inc.
 *
 * Authors:
 *   Fam Zheng <famz@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>
#include "qapi/error.h"
#include "exec/ramlist.h"
#include "exec/cpu-common.h"
#include "qemu/error-report.h"
#include "standard-headers/linux/pci_regs.h"
#include "qemu/event_notifier.h"
#include "qemu/vfio-helpers.h"
#include "qemu/lockable.h"
#include "trace.h"

#define QEMU_VFIO_DEBUG 0

#define QEMU_VFIO_IOVA_MIN 0x10000ULL
/* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface,
 * we can use a runtime limit; alternatively it's also possible to do platform
 * specific detection by reading sysfs entries. Until then, 39 is a safe bet.
 **/
#define QEMU_VFIO_IOVA_MAX (1ULL << 39)

typedef struct {
    /* Page aligned addr. */
    void *host;
    size_t size;
    uint64_t iova;
} IOVAMapping;

struct IOVARange {
    uint64_t start;
    uint64_t end;
};

struct QEMUVFIOState {
    QemuMutex lock;

    /* These fields are protected by BQL */
    int container;
    int group;
    int device;
    RAMBlockNotifier ram_notifier;
    struct vfio_region_info config_region_info, bar_region_info[6];
    struct IOVARange *usable_iova_ranges;
    uint8_t nb_iova_ranges;

    /* These fields are protected by @lock */
    /* VFIO's IO virtual address space is managed by splitting into a few
     * sections:
     *
     * ---------------       <= 0
     * |xxxxxxxxxxxxx|
     * |-------------|       <= QEMU_VFIO_IOVA_MIN
     * |             |
     * |    Fixed    |
     * |             |
     * |-------------|       <= low_water_mark
     * |             |
     * |    Free     |
     * |             |
     * |-------------|       <= high_water_mark
     * |             |
     * |    Temp     |
     * |             |
     * |-------------|       <= QEMU_VFIO_IOVA_MAX
     * |xxxxxxxxxxxxx|
     * |xxxxxxxxxxxxx|
     * ---------------
     *
     * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid;
     *
     * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of
     *   [QEMU_VFIO_IOVA_MIN, low_water_mark). Once allocated they will not be
     *   reclaimed - low_water_mark never shrinks;
     *
     * - IOVAs in range [low_water_mark, high_water_mark) are free;
     *
     * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile
     *   mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area
     *   is recycled. The caller should make sure I/O's depending on these
     *   mappings are completed before calling.
     **/
    uint64_t low_water_mark;
    uint64_t high_water_mark;
    IOVAMapping *mappings;
    int nr_mappings;
};

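/*
 * Worked example (illustrative only, not part of the allocator): with the
 * defaults above, a fixed (permanent) mapping of 2 MiB would move
 * low_water_mark from 0x10000 to 0x210000, while a temporary mapping of
 * 1 MiB would move high_water_mark from (1ULL << 39) down by 0x100000;
 * the region in between remains free for later allocations.
 */
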
/**
 * Find the VFIO group file for the PCI device at the address given in
 * @device and return its path. The returned string is owned by the caller
 * and should be g_free()'d when no longer needed.
 */
static char *sysfs_find_group_file(const char *device, Error **errp)
{
    char *sysfs_link;
    char *sysfs_group;
    char *p;
    char *path = NULL;

    sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device);
    sysfs_group = g_malloc0(PATH_MAX);
    if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) == -1) {
        error_setg_errno(errp, errno, "Failed to find iommu group sysfs path");
        goto out;
    }
    p = strrchr(sysfs_group, '/');
    if (!p) {
        error_setg(errp, "Failed to find iommu group number");
        goto out;
    }

    path = g_strdup_printf("/dev/vfio/%s", p + 1);
out:
    g_free(sysfs_link);
    g_free(sysfs_group);
    return path;
}

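/*
 * Example (illustrative only; the group number depends on the host IOMMU
 * topology): for device "0000:01:00.0", the symlink
 * /sys/bus/pci/devices/0000:01:00.0/iommu_group might point at an
 * iommu_groups entry such as ".../kernel/iommu_groups/15", in which case
 * this function returns "/dev/vfio/15".
 */
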
static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
{
    assert(index >= 0 && index < ARRAY_SIZE(s->bar_region_info));
}

static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
{
    assert_bar_index_valid(s, index);
    s->bar_region_info[index] = (struct vfio_region_info) {
        .index = VFIO_PCI_BAR0_REGION_INDEX + index,
        .argsz = sizeof(struct vfio_region_info),
    };
    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[index])) {
        error_setg_errno(errp, errno, "Failed to get BAR region info");
        return -errno;
    }

    return 0;
}

/**
 * Map a PCI BAR area.
 */
void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
                            uint64_t offset, uint64_t size, int prot,
                            Error **errp)
{
    void *p;
    assert_bar_index_valid(s, index);
    p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
             prot, MAP_SHARED,
             s->device, s->bar_region_info[index].offset + offset);
    if (p == MAP_FAILED) {
        error_setg_errno(errp, errno, "Failed to map BAR region");
        p = NULL;
    }
    return p;
}

/**
 * Unmap a PCI BAR area.
 */
void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar,
                             uint64_t offset, uint64_t size)
{
    if (bar) {
        munmap(bar, MIN(size, s->bar_region_info[index].size - offset));
    }
}

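/*
 * Usage sketch (hypothetical caller; the BAR size is a placeholder chosen
 * for the example): map the device's BAR 0 registers read/write and unmap
 * them on teardown.
 *
 *     void *regs = qemu_vfio_pci_map_bar(s, 0, 0, 0x1000,
 *                                        PROT_READ | PROT_WRITE, errp);
 *     if (regs) {
 *         ... access device registers through regs ...
 *         qemu_vfio_pci_unmap_bar(s, 0, regs, 0, 0x1000);
 *     }
 */
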
/**
 * Initialize device IRQ with @irq_type and register an event notifier.
 */
int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e,
                           int irq_type, Error **errp)
{
    int r;
    struct vfio_irq_set *irq_set;
    size_t irq_set_size;
    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };

    irq_info.index = irq_type;
    if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) {
        error_setg_errno(errp, errno, "Failed to get device interrupt info");
        return -errno;
    }
    if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
        error_setg(errp, "Device interrupt doesn't support eventfd");
        return -EINVAL;
    }

    irq_set_size = sizeof(*irq_set) + sizeof(int);
    irq_set = g_malloc0(irq_set_size);

    /* Get to a known IRQ state */
    *irq_set = (struct vfio_irq_set) {
        .argsz = irq_set_size,
        .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
        .index = irq_info.index,
        .start = 0,
        .count = 1,
    };

    *(int *)&irq_set->data = event_notifier_get_fd(e);
    r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (r) {
        error_setg_errno(errp, errno, "Failed to setup device interrupt");
        return -errno;
    }
    return 0;
}

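/*
 * Usage sketch (hypothetical caller): route MSI-X vector 0 of the device to
 * an EventNotifier so it can be polled or hooked into an AioContext.
 *
 *     EventNotifier irq_notifier;
 *
 *     if (event_notifier_init(&irq_notifier, 0) < 0) {
 *         ... handle error ...
 *     }
 *     if (qemu_vfio_pci_init_irq(s, &irq_notifier,
 *                                VFIO_PCI_MSIX_IRQ_INDEX, errp)) {
 *         ... handle error ...
 *     }
 */
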
static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
                                     int size, int ofs)
{
    int ret;

    do {
        ret = pread(s->device, buf, size, s->config_region_info.offset + ofs);
    } while (ret == -1 && errno == EINTR);
    return ret == size ? 0 : -errno;
}

static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs)
{
    int ret;

    do {
        ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs);
    } while (ret == -1 && errno == EINTR);
    return ret == size ? 0 : -errno;
}

static void collect_usable_iova_ranges(QEMUVFIOState *s, void *buf)
{
    struct vfio_iommu_type1_info *info = (struct vfio_iommu_type1_info *)buf;
    struct vfio_info_cap_header *cap = (void *)buf + info->cap_offset;
    struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
    int i;

    while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
        if (!cap->next) {
            return;
        }
        cap = (struct vfio_info_cap_header *)(buf + cap->next);
    }

    cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;

    s->nb_iova_ranges = cap_iova_range->nr_iovas;
    if (s->nb_iova_ranges > 1) {
        s->usable_iova_ranges =
            g_realloc(s->usable_iova_ranges,
                      s->nb_iova_ranges * sizeof(struct IOVARange));
    }

    for (i = 0; i < s->nb_iova_ranges; i++) {
        s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
        s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
    }
}

static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
                              Error **errp)
{
    int ret;
    int i;
    uint16_t pci_cmd;
    struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
    struct vfio_iommu_type1_info *iommu_info = NULL;
    size_t iommu_info_size = sizeof(*iommu_info);
    struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
    char *group_file = NULL;

    s->usable_iova_ranges = NULL;

    /* Create a new container */
    s->container = open("/dev/vfio/vfio", O_RDWR);

    if (s->container == -1) {
        error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio");
        return -errno;
    }
    if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
        error_setg(errp, "Invalid VFIO version");
        ret = -EINVAL;
        goto fail_container;
    }

    if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
        error_setg_errno(errp, errno, "VFIO IOMMU check failed");
        ret = -EINVAL;
        goto fail_container;
    }

    /* Open the group */
    group_file = sysfs_find_group_file(device, errp);
    if (!group_file) {
        ret = -EINVAL;
        goto fail_container;
    }

    s->group = open(group_file, O_RDWR);
    if (s->group == -1) {
        error_setg_errno(errp, errno, "Failed to open VFIO group file: %s",
                         group_file);
        g_free(group_file);
        ret = -errno;
        goto fail_container;
    }
    g_free(group_file);

    /* Test whether the group is viable and available */
    if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) {
        error_setg_errno(errp, errno, "Failed to get VFIO group status");
        ret = -errno;
        goto fail;
    }

    if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_setg(errp, "VFIO group is not viable");
        ret = -EINVAL;
        goto fail;
    }

    /* Add the group to the container */
    if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) {
        error_setg_errno(errp, errno, "Failed to add group to VFIO container");
        ret = -errno;
        goto fail;
    }

    /* Enable the IOMMU model we want */
    if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) {
        error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type");
        ret = -errno;
        goto fail;
    }

    iommu_info = g_malloc0(iommu_info_size);
    iommu_info->argsz = iommu_info_size;

    /* Get additional IOMMU info */
    if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
        error_setg_errno(errp, errno, "Failed to get IOMMU info");
        ret = -errno;
        goto fail;
    }

    /*
     * If the kernel does not report usable IOVA regions, choose
     * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX - 1] region
     */
    s->nb_iova_ranges = 1;
    s->usable_iova_ranges = g_new0(struct IOVARange, 1);
    s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
    s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;

    if (iommu_info->argsz > iommu_info_size) {
        iommu_info_size = iommu_info->argsz;
        iommu_info = g_realloc(iommu_info, iommu_info_size);
        if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
            ret = -errno;
            goto fail;
        }
        collect_usable_iova_ranges(s, iommu_info);
    }

    s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);

    if (s->device < 0) {
        error_setg_errno(errp, errno, "Failed to get device fd");
        ret = -errno;
        goto fail;
    }

    /* Test and setup the device */
    if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) {
        error_setg_errno(errp, errno, "Failed to get device info");
        ret = -errno;
        goto fail;
    }

    if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
        error_setg(errp, "Invalid device regions");
        ret = -EINVAL;
        goto fail;
    }

    s->config_region_info = (struct vfio_region_info) {
        .index = VFIO_PCI_CONFIG_REGION_INDEX,
        .argsz = sizeof(struct vfio_region_info),
    };
    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) {
        error_setg_errno(errp, errno, "Failed to get config region info");
        ret = -errno;
        goto fail;
    }

    for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) {
        ret = qemu_vfio_pci_init_bar(s, i, errp);
        if (ret) {
            goto fail;
        }
    }

    /* Enable bus master */
    ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
    if (ret) {
        goto fail;
    }
    pci_cmd |= PCI_COMMAND_MASTER;
    ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
    if (ret) {
        goto fail;
    }
    g_free(iommu_info);
    return 0;
fail:
    g_free(s->usable_iova_ranges);
    s->usable_iova_ranges = NULL;
    s->nb_iova_ranges = 0;
    g_free(iommu_info);
    close(s->group);
fail_container:
    close(s->container);
    return ret;
}

static void qemu_vfio_ram_block_added(RAMBlockNotifier *n,
                                      void *host, size_t size)
{
    QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
    trace_qemu_vfio_ram_block_added(s, host, size);
    qemu_vfio_dma_map(s, host, size, false, NULL);
}

static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n,
                                        void *host, size_t size)
{
    QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
    if (host) {
        trace_qemu_vfio_ram_block_removed(s, host, size);
        qemu_vfio_dma_unmap(s, host);
    }
}

static int qemu_vfio_init_ramblock(RAMBlock *rb, void *opaque)
{
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    int ret;
    QEMUVFIOState *s = opaque;

    if (!host_addr) {
        return 0;
    }
    ret = qemu_vfio_dma_map(s, host_addr, length, false, NULL);
    if (ret) {
        fprintf(stderr, "qemu_vfio_init_ramblock: failed %p %" PRId64 "\n",
                host_addr, (uint64_t)length);
    }
    return 0;
}

static void qemu_vfio_open_common(QEMUVFIOState *s)
{
    qemu_mutex_init(&s->lock);
    s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added;
    s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed;
    ram_block_notifier_add(&s->ram_notifier);
    s->low_water_mark = QEMU_VFIO_IOVA_MIN;
    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
    qemu_ram_foreach_block(qemu_vfio_init_ramblock, s);
}

/**
 * Open a PCI device, e.g. "0000:00:01.0".
 */
QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
{
    int r;
    QEMUVFIOState *s = g_new0(QEMUVFIOState, 1);

    r = qemu_vfio_init_pci(s, device, errp);
    if (r) {
        g_free(s);
        return NULL;
    }
    qemu_vfio_open_common(s);
    return s;
}

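/*
 * Usage sketch (hypothetical caller; the PCI address is a placeholder):
 *
 *     Error *local_err = NULL;
 *     QEMUVFIOState *vfio = qemu_vfio_open_pci("0000:01:00.0", &local_err);
 *
 *     if (!vfio) {
 *         error_report_err(local_err);
 *         return;
 *     }
 *     ... use qemu_vfio_pci_map_bar(), qemu_vfio_dma_map(), etc. ...
 *     qemu_vfio_close(vfio);
 */
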
static void qemu_vfio_dump_mapping(IOVAMapping *m)
{
    if (QEMU_VFIO_DEBUG) {
        printf("  vfio mapping %p %" PRIx64 " to %" PRIx64 "\n", m->host,
               (uint64_t)m->size, (uint64_t)m->iova);
    }
}

static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
{
    int i;

    if (QEMU_VFIO_DEBUG) {
        printf("vfio mappings\n");
        for (i = 0; i < s->nr_mappings; ++i) {
            qemu_vfio_dump_mapping(&s->mappings[i]);
        }
    }
}

/**
 * Find the mapping entry that contains [host, host + size) and set @index to
 * the position. If no entry contains it, @index is the position _after_ which
 * to insert the new mapping. IOW, it is the index of the largest element that
 * is smaller than @host, or -1 if no entry is.
 */
static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host,
                                           int *index)
{
    IOVAMapping *p = s->mappings;
    IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL;
    IOVAMapping *mid;
    trace_qemu_vfio_find_mapping(s, host);
    if (!p) {
        *index = -1;
        return NULL;
    }
    while (true) {
        mid = p + (q - p) / 2;
        if (mid == p) {
            break;
        }
        if (mid->host > host) {
            q = mid;
        } else if (mid->host < host) {
            p = mid;
        } else {
            break;
        }
    }
    if (mid->host > host) {
        mid--;
    } else if (mid < &s->mappings[s->nr_mappings - 1]
               && (mid + 1)->host <= host) {
        mid++;
    }
    *index = mid - &s->mappings[0];
    if (mid >= &s->mappings[0] &&
        mid->host <= host && mid->host + mid->size > host) {
        assert(mid < &s->mappings[s->nr_mappings]);
        return mid;
    }
    /* At this point *index + 1 is the right position to insert the new
     * mapping. */
    return NULL;
}

/**
 * Allocate an IOVA and create a new mapping record, then insert it into @s.
 */
static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s,
                                          void *host, size_t size,
                                          int index, uint64_t iova)
{
    int shift;
    IOVAMapping m = {.host = host, .size = size, .iova = iova};
    IOVAMapping *insert;

    assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size));
    assert(QEMU_IS_ALIGNED(s->low_water_mark, qemu_real_host_page_size));
    assert(QEMU_IS_ALIGNED(s->high_water_mark, qemu_real_host_page_size));
    trace_qemu_vfio_new_mapping(s, host, size, index, iova);

    assert(index >= 0);
    s->nr_mappings++;
    s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
    insert = &s->mappings[index];
    shift = s->nr_mappings - index - 1;
    if (shift) {
        memmove(insert + 1, insert, shift * sizeof(s->mappings[0]));
    }
    *insert = m;
    return insert;
}

/* Do the DMA mapping with VFIO. */
static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
                                uint64_t iova)
{
    struct vfio_iommu_type1_dma_map dma_map = {
        .argsz = sizeof(dma_map),
        .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
        .iova = iova,
        .vaddr = (uintptr_t)host,
        .size = size,
    };
    trace_qemu_vfio_do_mapping(s, host, size, iova);

    if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
        error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
        return -errno;
    }
    return 0;
}

/**
 * Undo the DMA mapping from @s with VFIO, and remove from mapping list.
 */
static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping,
                                   Error **errp)
{
    int index;
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = mapping->iova,
        .size = mapping->size,
    };

    index = mapping - s->mappings;
    assert(mapping->size > 0);
    assert(QEMU_IS_ALIGNED(mapping->size, qemu_real_host_page_size));
    assert(index >= 0 && index < s->nr_mappings);
    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        error_setg_errno(errp, errno, "VFIO_UNMAP_DMA failed");
    }
    memmove(mapping, &s->mappings[index + 1],
            sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
    s->nr_mappings--;
    s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
}

/* Check if the mapping list is (ascending) ordered. */
static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
{
    int i;
    if (QEMU_VFIO_DEBUG) {
        for (i = 0; i < s->nr_mappings - 1; ++i) {
            if (!(s->mappings[i].host < s->mappings[i + 1].host)) {
                fprintf(stderr, "item %d not sorted!\n", i);
                qemu_vfio_dump_mappings(s);
                return false;
            }
            if (!(s->mappings[i].host + s->mappings[i].size <=
                  s->mappings[i + 1].host)) {
                fprintf(stderr, "item %d overlap with next!\n", i);
                qemu_vfio_dump_mappings(s);
                return false;
            }
        }
    }
    return true;
}

static int
qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
{
    int i;

    for (i = 0; i < s->nb_iova_ranges; i++) {
        if (s->usable_iova_ranges[i].end < s->low_water_mark) {
            continue;
        }
        s->low_water_mark =
            MAX(s->low_water_mark, s->usable_iova_ranges[i].start);

        if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size ||
            s->usable_iova_ranges[i].end - s->low_water_mark + 1 == 0) {
            *iova = s->low_water_mark;
            s->low_water_mark += size;
            return 0;
        }
    }
    return -ENOMEM;
}

static int
qemu_vfio_find_temp_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
{
    int i;

    for (i = s->nb_iova_ranges - 1; i >= 0; i--) {
        if (s->usable_iova_ranges[i].start > s->high_water_mark) {
            continue;
        }
        s->high_water_mark =
            MIN(s->high_water_mark, s->usable_iova_ranges[i].end + 1);

        if (s->high_water_mark - s->usable_iova_ranges[i].start + 1 >= size ||
            s->high_water_mark - s->usable_iova_ranges[i].start + 1 == 0) {
            *iova = s->high_water_mark - size;
            s->high_water_mark = *iova;
            return 0;
        }
    }
    return -ENOMEM;
}

/* Map [host, host + size) area into a contiguous IOVA address space, and store
 * the result in @iova if not NULL. The caller needs to make sure the area is
 * aligned to page size, and mustn't overlap with existing mapping areas (split
 * mapping status within this area is not allowed).
 */
int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
                      bool temporary, uint64_t *iova)
{
    int ret = 0;
    int index;
    IOVAMapping *mapping;
    uint64_t iova0;

    assert(QEMU_PTR_IS_ALIGNED(host, qemu_real_host_page_size));
    assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size));
    trace_qemu_vfio_dma_map(s, host, size, temporary, iova);
    qemu_mutex_lock(&s->lock);
    mapping = qemu_vfio_find_mapping(s, host, &index);
    if (mapping) {
        iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host);
    } else {
        if (s->high_water_mark - s->low_water_mark + 1 < size) {
            ret = -ENOMEM;
            goto out;
        }
        if (!temporary) {
            if (qemu_vfio_find_fixed_iova(s, size, &iova0)) {
                ret = -ENOMEM;
                goto out;
            }

            mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
            if (!mapping) {
                ret = -ENOMEM;
                goto out;
            }
            assert(qemu_vfio_verify_mappings(s));
            ret = qemu_vfio_do_mapping(s, host, size, iova0);
            if (ret) {
                qemu_vfio_undo_mapping(s, mapping, NULL);
                goto out;
            }
            qemu_vfio_dump_mappings(s);
        } else {
            if (qemu_vfio_find_temp_iova(s, size, &iova0)) {
                ret = -ENOMEM;
                goto out;
            }
            ret = qemu_vfio_do_mapping(s, host, size, iova0);
            if (ret) {
                goto out;
            }
        }
    }
    if (iova) {
        *iova = iova0;
    }
out:
    qemu_mutex_unlock(&s->lock);
    return ret;
}

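/*
 * Usage sketch (hypothetical caller; the buffer size is chosen for the
 * example only): create a permanent mapping for a page-aligned buffer and
 * hand the resulting IOVA to the device for DMA.
 *
 *     uint64_t iova;
 *     size_t len = 2 * qemu_real_host_page_size;
 *     void *buf = qemu_memalign(qemu_real_host_page_size, len);
 *
 *     if (qemu_vfio_dma_map(s, buf, len, false, &iova) == 0) {
 *         ... program iova into the device ...
 *         qemu_vfio_dma_unmap(s, buf);
 *     }
 *     qemu_vfree(buf);
 */
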
/* Reset the high watermark and free all "temporary" mappings. */
int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
{
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = s->high_water_mark,
        .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark,
    };
    trace_qemu_vfio_dma_reset_temporary(s);
    QEMU_LOCK_GUARD(&s->lock);
    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
        return -errno;
    }
    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
    return 0;
}

/* Unmap the whole area that was previously mapped with
 * qemu_vfio_dma_map(). */
void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host)
{
    int index = 0;
    IOVAMapping *m;

    if (!host) {
        return;
    }

    trace_qemu_vfio_dma_unmap(s, host);
    qemu_mutex_lock(&s->lock);
    m = qemu_vfio_find_mapping(s, host, &index);
    if (!m) {
        goto out;
    }
    qemu_vfio_undo_mapping(s, m, NULL);
out:
    qemu_mutex_unlock(&s->lock);
}

static void qemu_vfio_reset(QEMUVFIOState *s)
{
    ioctl(s->device, VFIO_DEVICE_RESET);
}

/* Close and free the VFIO resources. */
void qemu_vfio_close(QEMUVFIOState *s)
{
    int i;

    if (!s) {
        return;
    }
    /* Undo from the tail: qemu_vfio_undo_mapping() removes its entry from
     * s->mappings, so iterating forward would skip every other element. */
    for (i = s->nr_mappings - 1; i >= 0; --i) {
        qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
    }
    ram_block_notifier_remove(&s->ram_notifier);
    g_free(s->usable_iova_ranges);
    s->nb_iova_ranges = 0;
    qemu_vfio_reset(s);
    close(s->device);
    close(s->group);
    close(s->container);
}