]>
Commit | Line | Data |
---|---|---|
418026ca FZ |
1 | /* |
2 | * VFIO utility | |
3 | * | |
4 | * Copyright 2016 - 2018 Red Hat, Inc. | |
5 | * | |
6 | * Authors: | |
7 | * Fam Zheng <famz@redhat.com> | |
8 | * | |
9 | * This work is licensed under the terms of the GNU GPL, version 2 or later. | |
10 | * See the COPYING file in the top-level directory. | |
11 | */ | |
12 | ||
13 | #include "qemu/osdep.h" | |
14 | #include <sys/ioctl.h> | |
15 | #include <linux/vfio.h> | |
16 | #include "qapi/error.h" | |
17 | #include "exec/ramlist.h" | |
18 | #include "exec/cpu-common.h" | |
19 | #include "trace.h" | |
20 | #include "qemu/queue.h" | |
21 | #include "qemu/error-report.h" | |
22 | #include "standard-headers/linux/pci_regs.h" | |
23 | #include "qemu/event_notifier.h" | |
24 | #include "qemu/vfio-helpers.h" | |
25 | #include "trace.h" | |
26 | ||
/* Compile-time switch: non-zero enables the printf-based mapping dumps and
 * the mapping-order verification below. */
#define QEMU_VFIO_DEBUG 0

/* Lowest IOVA ever handed out; addresses below it are kept invalid. */
#define QEMU_VFIO_IOVA_MIN 0x10000ULL
/* Exclusive upper bound of the managed IOVA space.
 *
 * XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface,
 * we can use a runtime limit; alternatively it's also possible to do platform
 * specific detection by reading sysfs entries. Until then, 39 is a safe bet.
 */
#define QEMU_VFIO_IOVA_MAX (1ULL << 39)
35 | ||
/* One host memory range together with the IOVA it has been assigned. */
typedef struct {
    void *host;     /* Start of the host range; page aligned. */
    size_t size;    /* Length of the range. */
    uint64_t iova;  /* I/O virtual address the range is mapped at. */
} IOVAMapping;
42 | ||
43 | struct QEMUVFIOState { | |
44 | QemuMutex lock; | |
45 | ||
46 | /* These fields are protected by BQL */ | |
47 | int container; | |
48 | int group; | |
49 | int device; | |
50 | RAMBlockNotifier ram_notifier; | |
51 | struct vfio_region_info config_region_info, bar_region_info[6]; | |
52 | ||
53 | /* These fields are protected by @lock */ | |
54 | /* VFIO's IO virtual address space is managed by splitting into a few | |
55 | * sections: | |
56 | * | |
57 | * --------------- <= 0 | |
58 | * |xxxxxxxxxxxxx| | |
59 | * |-------------| <= QEMU_VFIO_IOVA_MIN | |
60 | * | | | |
61 | * | Fixed | | |
62 | * | | | |
63 | * |-------------| <= low_water_mark | |
64 | * | | | |
65 | * | Free | | |
66 | * | | | |
67 | * |-------------| <= high_water_mark | |
68 | * | | | |
69 | * | Temp | | |
70 | * | | | |
71 | * |-------------| <= QEMU_VFIO_IOVA_MAX | |
72 | * |xxxxxxxxxxxxx| | |
73 | * |xxxxxxxxxxxxx| | |
74 | * --------------- | |
75 | * | |
76 | * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid; | |
77 | * | |
78 | * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of | |
79 | * [QEMU_VFIO_IOVA_MIN, low_water_mark). Once allocated they will not be | |
80 | * reclaimed - low_water_mark never shrinks; | |
81 | * | |
82 | * - IOVAs in range [low_water_mark, high_water_mark) are free; | |
83 | * | |
84 | * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile | |
85 | * mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area | |
86 | * is recycled. The caller should make sure I/O's depending on these | |
87 | * mappings are completed before calling. | |
88 | **/ | |
89 | uint64_t low_water_mark; | |
90 | uint64_t high_water_mark; | |
91 | IOVAMapping *mappings; | |
92 | int nr_mappings; | |
93 | }; | |
94 | ||
95 | /** | |
96 | * Find group file by PCI device address as specified @device, and return the | |
97 | * path. The returned string is owned by caller and should be g_free'ed later. | |
98 | */ | |
99 | static char *sysfs_find_group_file(const char *device, Error **errp) | |
100 | { | |
101 | char *sysfs_link; | |
102 | char *sysfs_group; | |
103 | char *p; | |
104 | char *path = NULL; | |
105 | ||
106 | sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device); | |
107 | sysfs_group = g_malloc(PATH_MAX); | |
108 | if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) == -1) { | |
109 | error_setg_errno(errp, errno, "Failed to find iommu group sysfs path"); | |
110 | goto out; | |
111 | } | |
112 | p = strrchr(sysfs_group, '/'); | |
113 | if (!p) { | |
114 | error_setg(errp, "Failed to find iommu group number"); | |
115 | goto out; | |
116 | } | |
117 | ||
118 | path = g_strdup_printf("/dev/vfio/%s", p + 1); | |
119 | out: | |
120 | g_free(sysfs_link); | |
121 | g_free(sysfs_group); | |
122 | return path; | |
123 | } | |
124 | ||
125 | static inline void assert_bar_index_valid(QEMUVFIOState *s, int index) | |
126 | { | |
127 | assert(index >= 0 && index < ARRAY_SIZE(s->bar_region_info)); | |
128 | } | |
129 | ||
130 | static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp) | |
131 | { | |
132 | assert_bar_index_valid(s, index); | |
133 | s->bar_region_info[index] = (struct vfio_region_info) { | |
134 | .index = VFIO_PCI_BAR0_REGION_INDEX + index, | |
135 | .argsz = sizeof(struct vfio_region_info), | |
136 | }; | |
137 | if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[index])) { | |
138 | error_setg_errno(errp, errno, "Failed to get BAR region info"); | |
139 | return -errno; | |
140 | } | |
141 | ||
142 | return 0; | |
143 | } | |
144 | ||
145 | /** | |
146 | * Map a PCI bar area. | |
147 | */ | |
148 | void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index, | |
149 | uint64_t offset, uint64_t size, | |
150 | Error **errp) | |
151 | { | |
152 | void *p; | |
153 | assert_bar_index_valid(s, index); | |
154 | p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset), | |
155 | PROT_READ | PROT_WRITE, MAP_SHARED, | |
156 | s->device, s->bar_region_info[index].offset + offset); | |
157 | if (p == MAP_FAILED) { | |
158 | error_setg_errno(errp, errno, "Failed to map BAR region"); | |
159 | p = NULL; | |
160 | } | |
161 | return p; | |
162 | } | |
163 | ||
164 | /** | |
165 | * Unmap a PCI bar area. | |
166 | */ | |
167 | void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar, | |
168 | uint64_t offset, uint64_t size) | |
169 | { | |
170 | if (bar) { | |
171 | munmap(bar, MIN(size, s->bar_region_info[index].size - offset)); | |
172 | } | |
173 | } | |
174 | ||
175 | /** | |
176 | * Initialize device IRQ with @irq_type and and register an event notifier. | |
177 | */ | |
178 | int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e, | |
179 | int irq_type, Error **errp) | |
180 | { | |
181 | int r; | |
182 | struct vfio_irq_set *irq_set; | |
183 | size_t irq_set_size; | |
184 | struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; | |
185 | ||
186 | irq_info.index = irq_type; | |
187 | if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) { | |
188 | error_setg_errno(errp, errno, "Failed to get device interrupt info"); | |
189 | return -errno; | |
190 | } | |
191 | if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) { | |
192 | error_setg(errp, "Device interrupt doesn't support eventfd"); | |
193 | return -EINVAL; | |
194 | } | |
195 | ||
196 | irq_set_size = sizeof(*irq_set) + sizeof(int); | |
197 | irq_set = g_malloc0(irq_set_size); | |
198 | ||
199 | /* Get to a known IRQ state */ | |
200 | *irq_set = (struct vfio_irq_set) { | |
201 | .argsz = irq_set_size, | |
202 | .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER, | |
203 | .index = irq_info.index, | |
204 | .start = 0, | |
205 | .count = 1, | |
206 | }; | |
207 | ||
208 | *(int *)&irq_set->data = event_notifier_get_fd(e); | |
209 | r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set); | |
210 | g_free(irq_set); | |
211 | if (r) { | |
212 | error_setg_errno(errp, errno, "Failed to setup device interrupt"); | |
213 | return -errno; | |
214 | } | |
215 | return 0; | |
216 | } | |
217 | ||
218 | static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf, | |
219 | int size, int ofs) | |
220 | { | |
221 | int ret; | |
222 | ||
223 | do { | |
224 | ret = pread(s->device, buf, size, s->config_region_info.offset + ofs); | |
225 | } while (ret == -1 && errno == EINTR); | |
226 | return ret == size ? 0 : -errno; | |
227 | } | |
228 | ||
229 | static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs) | |
230 | { | |
231 | int ret; | |
232 | ||
233 | do { | |
234 | ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs); | |
235 | } while (ret == -1 && errno == EINTR); | |
236 | return ret == size ? 0 : -errno; | |
237 | } | |
238 | ||
239 | static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device, | |
240 | Error **errp) | |
241 | { | |
242 | int ret; | |
243 | int i; | |
244 | uint16_t pci_cmd; | |
245 | struct vfio_group_status group_status = { .argsz = sizeof(group_status) }; | |
246 | struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) }; | |
247 | struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; | |
248 | char *group_file = NULL; | |
249 | ||
250 | /* Create a new container */ | |
251 | s->container = open("/dev/vfio/vfio", O_RDWR); | |
252 | ||
253 | if (s->container == -1) { | |
254 | error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio"); | |
255 | return -errno; | |
256 | } | |
257 | if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) { | |
258 | error_setg(errp, "Invalid VFIO version"); | |
259 | ret = -EINVAL; | |
260 | goto fail_container; | |
261 | } | |
262 | ||
263 | if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) { | |
264 | error_setg_errno(errp, errno, "VFIO IOMMU check failed"); | |
265 | ret = -EINVAL; | |
266 | goto fail_container; | |
267 | } | |
268 | ||
269 | /* Open the group */ | |
270 | group_file = sysfs_find_group_file(device, errp); | |
271 | if (!group_file) { | |
272 | ret = -EINVAL; | |
273 | goto fail_container; | |
274 | } | |
275 | ||
276 | s->group = open(group_file, O_RDWR); | |
277 | if (s->group == -1) { | |
278 | error_setg_errno(errp, errno, "Failed to open VFIO group file: %s", | |
279 | group_file); | |
280 | g_free(group_file); | |
281 | ret = -errno; | |
282 | goto fail_container; | |
283 | } | |
284 | g_free(group_file); | |
285 | ||
286 | /* Test the group is viable and available */ | |
287 | if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) { | |
288 | error_setg_errno(errp, errno, "Failed to get VFIO group status"); | |
289 | ret = -errno; | |
290 | goto fail; | |
291 | } | |
292 | ||
293 | if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { | |
294 | error_setg(errp, "VFIO group is not viable"); | |
295 | ret = -EINVAL; | |
296 | goto fail; | |
297 | } | |
298 | ||
299 | /* Add the group to the container */ | |
300 | if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) { | |
301 | error_setg_errno(errp, errno, "Failed to add group to VFIO container"); | |
302 | ret = -errno; | |
303 | goto fail; | |
304 | } | |
305 | ||
306 | /* Enable the IOMMU model we want */ | |
307 | if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) { | |
308 | error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type"); | |
309 | ret = -errno; | |
310 | goto fail; | |
311 | } | |
312 | ||
313 | /* Get additional IOMMU info */ | |
314 | if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) { | |
315 | error_setg_errno(errp, errno, "Failed to get IOMMU info"); | |
316 | ret = -errno; | |
317 | goto fail; | |
318 | } | |
319 | ||
320 | s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device); | |
321 | ||
322 | if (s->device < 0) { | |
323 | error_setg_errno(errp, errno, "Failed to get device fd"); | |
324 | ret = -errno; | |
325 | goto fail; | |
326 | } | |
327 | ||
328 | /* Test and setup the device */ | |
329 | if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) { | |
330 | error_setg_errno(errp, errno, "Failed to get device info"); | |
331 | ret = -errno; | |
332 | goto fail; | |
333 | } | |
334 | ||
335 | if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) { | |
336 | error_setg(errp, "Invalid device regions"); | |
337 | ret = -EINVAL; | |
338 | goto fail; | |
339 | } | |
340 | ||
341 | s->config_region_info = (struct vfio_region_info) { | |
342 | .index = VFIO_PCI_CONFIG_REGION_INDEX, | |
343 | .argsz = sizeof(struct vfio_region_info), | |
344 | }; | |
345 | if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) { | |
346 | error_setg_errno(errp, errno, "Failed to get config region info"); | |
347 | ret = -errno; | |
348 | goto fail; | |
349 | } | |
350 | ||
351 | for (i = 0; i < 6; i++) { | |
352 | ret = qemu_vfio_pci_init_bar(s, i, errp); | |
353 | if (ret) { | |
354 | goto fail; | |
355 | } | |
356 | } | |
357 | ||
358 | /* Enable bus master */ | |
359 | ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND); | |
360 | if (ret) { | |
361 | goto fail; | |
362 | } | |
363 | pci_cmd |= PCI_COMMAND_MASTER; | |
364 | ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND); | |
365 | if (ret) { | |
366 | goto fail; | |
367 | } | |
368 | return 0; | |
369 | fail: | |
370 | close(s->group); | |
371 | fail_container: | |
372 | close(s->container); | |
373 | return ret; | |
374 | } | |
375 | ||
376 | static void qemu_vfio_ram_block_added(RAMBlockNotifier *n, | |
377 | void *host, size_t size) | |
378 | { | |
379 | QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier); | |
380 | trace_qemu_vfio_ram_block_added(s, host, size); | |
381 | qemu_vfio_dma_map(s, host, size, false, NULL); | |
382 | } | |
383 | ||
384 | static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n, | |
385 | void *host, size_t size) | |
386 | { | |
387 | QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier); | |
388 | if (host) { | |
389 | trace_qemu_vfio_ram_block_removed(s, host, size); | |
390 | qemu_vfio_dma_unmap(s, host); | |
391 | } | |
392 | } | |
393 | ||
394 | static int qemu_vfio_init_ramblock(const char *block_name, void *host_addr, | |
395 | ram_addr_t offset, ram_addr_t length, | |
396 | void *opaque) | |
397 | { | |
398 | int ret; | |
399 | QEMUVFIOState *s = opaque; | |
400 | ||
401 | if (!host_addr) { | |
402 | return 0; | |
403 | } | |
404 | ret = qemu_vfio_dma_map(s, host_addr, length, false, NULL); | |
405 | if (ret) { | |
406 | fprintf(stderr, "qemu_vfio_init_ramblock: failed %p %" PRId64 "\n", | |
407 | host_addr, (uint64_t)length); | |
408 | } | |
409 | return 0; | |
410 | } | |
411 | ||
412 | static void qemu_vfio_open_common(QEMUVFIOState *s) | |
413 | { | |
414 | s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added; | |
415 | s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed; | |
416 | ram_block_notifier_add(&s->ram_notifier); | |
417 | s->low_water_mark = QEMU_VFIO_IOVA_MIN; | |
418 | s->high_water_mark = QEMU_VFIO_IOVA_MAX; | |
419 | qemu_ram_foreach_block(qemu_vfio_init_ramblock, s); | |
420 | qemu_mutex_init(&s->lock); | |
421 | } | |
422 | ||
423 | /** | |
424 | * Open a PCI device, e.g. "0000:00:01.0". | |
425 | */ | |
426 | QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp) | |
427 | { | |
428 | int r; | |
429 | QEMUVFIOState *s = g_new0(QEMUVFIOState, 1); | |
430 | ||
431 | r = qemu_vfio_init_pci(s, device, errp); | |
432 | if (r) { | |
433 | g_free(s); | |
434 | return NULL; | |
435 | } | |
436 | qemu_vfio_open_common(s); | |
437 | return s; | |
438 | } | |
439 | ||
440 | static void qemu_vfio_dump_mapping(IOVAMapping *m) | |
441 | { | |
442 | if (QEMU_VFIO_DEBUG) { | |
443 | printf(" vfio mapping %p %" PRIx64 " to %" PRIx64 "\n", m->host, | |
444 | (uint64_t)m->size, (uint64_t)m->iova); | |
445 | } | |
446 | } | |
447 | ||
448 | static void qemu_vfio_dump_mappings(QEMUVFIOState *s) | |
449 | { | |
450 | int i; | |
451 | ||
452 | if (QEMU_VFIO_DEBUG) { | |
453 | printf("vfio mappings\n"); | |
454 | for (i = 0; i < s->nr_mappings; ++i) { | |
455 | qemu_vfio_dump_mapping(&s->mappings[i]); | |
456 | } | |
457 | } | |
458 | } | |
459 | ||
460 | /** | |
461 | * Find the mapping entry that contains [host, host + size) and set @index to | |
462 | * the position. If no entry contains it, @index is the position _after_ which | |
463 | * to insert the new mapping. IOW, it is the index of the largest element that | |
464 | * is smaller than @host, or -1 if no entry is. | |
465 | */ | |
466 | static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host, | |
467 | int *index) | |
468 | { | |
469 | IOVAMapping *p = s->mappings; | |
470 | IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL; | |
471 | IOVAMapping *mid; | |
472 | trace_qemu_vfio_find_mapping(s, host); | |
473 | if (!p) { | |
474 | *index = -1; | |
475 | return NULL; | |
476 | } | |
477 | while (true) { | |
478 | mid = p + (q - p) / 2; | |
479 | if (mid == p) { | |
480 | break; | |
481 | } | |
482 | if (mid->host > host) { | |
483 | q = mid; | |
484 | } else if (mid->host < host) { | |
485 | p = mid; | |
486 | } else { | |
487 | break; | |
488 | } | |
489 | } | |
490 | if (mid->host > host) { | |
491 | mid--; | |
492 | } else if (mid < &s->mappings[s->nr_mappings - 1] | |
493 | && (mid + 1)->host <= host) { | |
494 | mid++; | |
495 | } | |
496 | *index = mid - &s->mappings[0]; | |
497 | if (mid >= &s->mappings[0] && | |
498 | mid->host <= host && mid->host + mid->size > host) { | |
499 | assert(mid < &s->mappings[s->nr_mappings]); | |
500 | return mid; | |
501 | } | |
502 | /* At this point *index + 1 is the right position to insert the new | |
503 | * mapping.*/ | |
504 | return NULL; | |
505 | } | |
506 | ||
507 | /** | |
508 | * Allocate IOVA and and create a new mapping record and insert it in @s. | |
509 | */ | |
510 | static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s, | |
511 | void *host, size_t size, | |
512 | int index, uint64_t iova) | |
513 | { | |
514 | int shift; | |
515 | IOVAMapping m = {.host = host, .size = size, .iova = iova}; | |
516 | IOVAMapping *insert; | |
517 | ||
518 | assert(QEMU_IS_ALIGNED(size, getpagesize())); | |
519 | assert(QEMU_IS_ALIGNED(s->low_water_mark, getpagesize())); | |
520 | assert(QEMU_IS_ALIGNED(s->high_water_mark, getpagesize())); | |
521 | trace_qemu_vfio_new_mapping(s, host, size, index, iova); | |
522 | ||
523 | assert(index >= 0); | |
524 | s->nr_mappings++; | |
525 | s->mappings = g_realloc_n(s->mappings, sizeof(s->mappings[0]), | |
526 | s->nr_mappings); | |
527 | insert = &s->mappings[index]; | |
528 | shift = s->nr_mappings - index - 1; | |
529 | if (shift) { | |
530 | memmove(insert + 1, insert, shift * sizeof(s->mappings[0])); | |
531 | } | |
532 | *insert = m; | |
533 | return insert; | |
534 | } | |
535 | ||
536 | /* Do the DMA mapping with VFIO. */ | |
537 | static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size, | |
538 | uint64_t iova) | |
539 | { | |
540 | struct vfio_iommu_type1_dma_map dma_map = { | |
541 | .argsz = sizeof(dma_map), | |
542 | .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, | |
543 | .iova = iova, | |
544 | .vaddr = (uintptr_t)host, | |
545 | .size = size, | |
546 | }; | |
547 | trace_qemu_vfio_do_mapping(s, host, size, iova); | |
548 | ||
549 | if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) { | |
550 | error_report("VFIO_MAP_DMA: %d", -errno); | |
551 | return -errno; | |
552 | } | |
553 | return 0; | |
554 | } | |
555 | ||
556 | /** | |
557 | * Undo the DMA mapping from @s with VFIO, and remove from mapping list. | |
558 | */ | |
559 | static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping, | |
560 | Error **errp) | |
561 | { | |
562 | int index; | |
563 | struct vfio_iommu_type1_dma_unmap unmap = { | |
564 | .argsz = sizeof(unmap), | |
565 | .flags = 0, | |
566 | .iova = mapping->iova, | |
567 | .size = mapping->size, | |
568 | }; | |
569 | ||
570 | index = mapping - s->mappings; | |
571 | assert(mapping->size > 0); | |
572 | assert(QEMU_IS_ALIGNED(mapping->size, getpagesize())); | |
573 | assert(index >= 0 && index < s->nr_mappings); | |
574 | if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) { | |
575 | error_setg(errp, "VFIO_UNMAP_DMA failed: %d", -errno); | |
576 | } | |
577 | memmove(mapping, &s->mappings[index + 1], | |
578 | sizeof(s->mappings[0]) * (s->nr_mappings - index - 1)); | |
579 | s->nr_mappings--; | |
580 | s->mappings = g_realloc_n(s->mappings, sizeof(s->mappings[0]), | |
581 | s->nr_mappings); | |
582 | } | |
583 | ||
584 | /* Check if the mapping list is (ascending) ordered. */ | |
585 | static bool qemu_vfio_verify_mappings(QEMUVFIOState *s) | |
586 | { | |
587 | int i; | |
588 | if (QEMU_VFIO_DEBUG) { | |
589 | for (i = 0; i < s->nr_mappings - 1; ++i) { | |
590 | if (!(s->mappings[i].host < s->mappings[i + 1].host)) { | |
591 | fprintf(stderr, "item %d not sorted!\n", i); | |
592 | qemu_vfio_dump_mappings(s); | |
593 | return false; | |
594 | } | |
595 | if (!(s->mappings[i].host + s->mappings[i].size <= | |
596 | s->mappings[i + 1].host)) { | |
597 | fprintf(stderr, "item %d overlap with next!\n", i); | |
598 | qemu_vfio_dump_mappings(s); | |
599 | return false; | |
600 | } | |
601 | } | |
602 | } | |
603 | return true; | |
604 | } | |
605 | ||
606 | /* Map [host, host + size) area into a contiguous IOVA address space, and store | |
607 | * the result in @iova if not NULL. The caller need to make sure the area is | |
608 | * aligned to page size, and mustn't overlap with existing mapping areas (split | |
609 | * mapping status within this area is not allowed). | |
610 | */ | |
611 | int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size, | |
612 | bool temporary, uint64_t *iova) | |
613 | { | |
614 | int ret = 0; | |
615 | int index; | |
616 | IOVAMapping *mapping; | |
617 | uint64_t iova0; | |
618 | ||
619 | assert(QEMU_PTR_IS_ALIGNED(host, getpagesize())); | |
620 | assert(QEMU_IS_ALIGNED(size, getpagesize())); | |
621 | trace_qemu_vfio_dma_map(s, host, size, temporary, iova); | |
622 | qemu_mutex_lock(&s->lock); | |
623 | mapping = qemu_vfio_find_mapping(s, host, &index); | |
624 | if (mapping) { | |
625 | iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host); | |
626 | } else { | |
627 | if (s->high_water_mark - s->low_water_mark + 1 < size) { | |
628 | ret = -ENOMEM; | |
629 | goto out; | |
630 | } | |
631 | if (!temporary) { | |
632 | iova0 = s->low_water_mark; | |
633 | mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0); | |
634 | if (!mapping) { | |
635 | ret = -ENOMEM; | |
636 | goto out; | |
637 | } | |
638 | assert(qemu_vfio_verify_mappings(s)); | |
639 | ret = qemu_vfio_do_mapping(s, host, size, iova0); | |
640 | if (ret) { | |
641 | qemu_vfio_undo_mapping(s, mapping, NULL); | |
642 | goto out; | |
643 | } | |
644 | s->low_water_mark += size; | |
645 | qemu_vfio_dump_mappings(s); | |
646 | } else { | |
647 | iova0 = s->high_water_mark - size; | |
648 | ret = qemu_vfio_do_mapping(s, host, size, iova0); | |
649 | if (ret) { | |
650 | goto out; | |
651 | } | |
652 | s->high_water_mark -= size; | |
653 | } | |
654 | } | |
655 | if (iova) { | |
656 | *iova = iova0; | |
657 | } | |
658 | out: | |
659 | qemu_mutex_unlock(&s->lock); | |
660 | return ret; | |
661 | } | |
662 | ||
663 | /* Reset the high watermark and free all "temporary" mappings. */ | |
664 | int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s) | |
665 | { | |
666 | struct vfio_iommu_type1_dma_unmap unmap = { | |
667 | .argsz = sizeof(unmap), | |
668 | .flags = 0, | |
669 | .iova = s->high_water_mark, | |
670 | .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark, | |
671 | }; | |
672 | trace_qemu_vfio_dma_reset_temporary(s); | |
673 | qemu_mutex_lock(&s->lock); | |
674 | if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) { | |
675 | error_report("VFIO_UNMAP_DMA: %d", -errno); | |
676 | qemu_mutex_unlock(&s->lock); | |
677 | return -errno; | |
678 | } | |
679 | s->high_water_mark = QEMU_VFIO_IOVA_MAX; | |
680 | qemu_mutex_unlock(&s->lock); | |
681 | return 0; | |
682 | } | |
683 | ||
684 | /* Unmapping the whole area that was previously mapped with | |
685 | * qemu_vfio_dma_map(). */ | |
686 | void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host) | |
687 | { | |
688 | int index = 0; | |
689 | IOVAMapping *m; | |
690 | ||
691 | if (!host) { | |
692 | return; | |
693 | } | |
694 | ||
695 | trace_qemu_vfio_dma_unmap(s, host); | |
696 | qemu_mutex_lock(&s->lock); | |
697 | m = qemu_vfio_find_mapping(s, host, &index); | |
698 | if (!m) { | |
699 | goto out; | |
700 | } | |
701 | qemu_vfio_undo_mapping(s, m, NULL); | |
702 | out: | |
703 | qemu_mutex_unlock(&s->lock); | |
704 | } | |
705 | ||
706 | static void qemu_vfio_reset(QEMUVFIOState *s) | |
707 | { | |
708 | ioctl(s->device, VFIO_DEVICE_RESET); | |
709 | } | |
710 | ||
711 | /* Close and free the VFIO resources. */ | |
712 | void qemu_vfio_close(QEMUVFIOState *s) | |
713 | { | |
714 | int i; | |
715 | ||
716 | if (!s) { | |
717 | return; | |
718 | } | |
719 | for (i = 0; i < s->nr_mappings; ++i) { | |
720 | qemu_vfio_undo_mapping(s, &s->mappings[i], NULL); | |
721 | } | |
722 | ram_block_notifier_remove(&s->ram_notifier); | |
723 | qemu_vfio_reset(s); | |
724 | close(s->device); | |
725 | close(s->group); | |
726 | close(s->container); | |
727 | } |