]>
Commit | Line | Data |
---|---|---|
418026ca FZ |
1 | /* |
2 | * VFIO utility | |
3 | * | |
4 | * Copyright 2016 - 2018 Red Hat, Inc. | |
5 | * | |
6 | * Authors: | |
7 | * Fam Zheng <famz@redhat.com> | |
8 | * | |
9 | * This work is licensed under the terms of the GNU GPL, version 2 or later. | |
10 | * See the COPYING file in the top-level directory. | |
11 | */ | |
12 | ||
13 | #include "qemu/osdep.h" | |
14 | #include <sys/ioctl.h> | |
15 | #include <linux/vfio.h> | |
16 | #include "qapi/error.h" | |
17 | #include "exec/ramlist.h" | |
18 | #include "exec/cpu-common.h" | |
b430b513 | 19 | #include "exec/memory.h" |
418026ca | 20 | #include "trace.h" |
418026ca FZ |
21 | #include "qemu/error-report.h" |
22 | #include "standard-headers/linux/pci_regs.h" | |
23 | #include "qemu/event_notifier.h" | |
24 | #include "qemu/vfio-helpers.h" | |
6e8a355d | 25 | #include "qemu/lockable.h" |
418026ca FZ |
26 | #include "trace.h" |
27 | ||
28 | #define QEMU_VFIO_DEBUG 0 | |
29 | ||
30 | #define QEMU_VFIO_IOVA_MIN 0x10000ULL | |
31 | /* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface, | |
32 | * we can use a runtime limit; alternatively it's also possible to do platform | |
33 | * specific detection by reading sysfs entries. Until then, 39 is a safe bet. | |
34 | **/ | |
35 | #define QEMU_VFIO_IOVA_MAX (1ULL << 39) | |
36 | ||
/* One host-virtual-address -> IOVA mapping record; kept sorted by host
 * address in QEMUVFIOState.mappings. */
typedef struct {
    /* Page aligned addr. */
    void *host;
    /* Length of the mapped area in bytes. */
    size_t size;
    /* IO virtual address the host area is mapped at. */
    uint64_t iova;
} IOVAMapping;
43 | ||
4487d420 EA |
/* An inclusive [start, end] range of usable IO virtual addresses, as
 * reported by the kernel's VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE capability. */
struct IOVARange {
    uint64_t start;
    uint64_t end; /* inclusive upper bound */
};
48 | ||
418026ca FZ |
struct QEMUVFIOState {
    QemuMutex lock;

    /* These fields are protected by BQL */
    int container;                 /* /dev/vfio/vfio fd */
    int group;                     /* /dev/vfio/<group> fd */
    int device;                    /* VFIO device fd from the group */
    RAMBlockNotifier ram_notifier; /* maps/unmaps RAM blocks as they appear */
    /* Cached region info for PCI config space and the six BARs. */
    struct vfio_region_info config_region_info, bar_region_info[6];
    /* Usable IOVA ranges reported by the kernel (or the legacy default). */
    struct IOVARange *usable_iova_ranges;
    uint8_t nb_iova_ranges;

    /* These fields are protected by @lock */
    /* VFIO's IO virtual address space is managed by splitting into a few
     * sections:
     *
     * ---------------       <= 0
     * |xxxxxxxxxxxxx|
     * |-------------|       <= QEMU_VFIO_IOVA_MIN
     * |             |
     * |    Fixed    |
     * |             |
     * |-------------|       <= low_water_mark
     * |             |
     * |    Free     |
     * |             |
     * |-------------|       <= high_water_mark
     * |             |
     * |    Temp     |
     * |             |
     * |-------------|       <= QEMU_VFIO_IOVA_MAX
     * |xxxxxxxxxxxxx|
     * |xxxxxxxxxxxxx|
     * ---------------
     *
     * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid;
     *
     * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of
     *   [QEMU_VFIO_IOVA_MIN, low_water_mark).  Once allocated they will not
     *   be reclaimed - low_water_mark never shrinks;
     *
     * - IOVAs in range [low_water_mark, high_water_mark) are free;
     *
     * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile
     *   mappings. At each qemu_vfio_dma_reset_temporary() call, the whole
     *   area is recycled. The caller should make sure I/O's depending on
     *   these mappings are completed before calling.
     **/
    uint64_t low_water_mark;
    uint64_t high_water_mark;
    /* Array of fixed mappings, sorted ascending by host address. */
    IOVAMapping *mappings;
    int nr_mappings;
};
102 | ||
103 | /** | |
104 | * Find group file by PCI device address as specified @device, and return the | |
105 | * path. The returned string is owned by caller and should be g_free'ed later. | |
106 | */ | |
107 | static char *sysfs_find_group_file(const char *device, Error **errp) | |
108 | { | |
dbdea0db | 109 | g_autoptr(GError) gerr = NULL; |
418026ca FZ |
110 | char *sysfs_link; |
111 | char *sysfs_group; | |
112 | char *p; | |
113 | char *path = NULL; | |
114 | ||
115 | sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device); | |
dbdea0db AO |
116 | sysfs_group = g_file_read_link(sysfs_link, &gerr); |
117 | if (gerr) { | |
118 | error_setg(errp, "Failed to find iommu group sysfs path: %s", | |
119 | gerr->message); | |
418026ca FZ |
120 | goto out; |
121 | } | |
122 | p = strrchr(sysfs_group, '/'); | |
123 | if (!p) { | |
124 | error_setg(errp, "Failed to find iommu group number"); | |
125 | goto out; | |
126 | } | |
127 | ||
128 | path = g_strdup_printf("/dev/vfio/%s", p + 1); | |
129 | out: | |
130 | g_free(sysfs_link); | |
131 | g_free(sysfs_group); | |
132 | return path; | |
133 | } | |
134 | ||
135 | static inline void assert_bar_index_valid(QEMUVFIOState *s, int index) | |
136 | { | |
137 | assert(index >= 0 && index < ARRAY_SIZE(s->bar_region_info)); | |
138 | } | |
139 | ||
/* Query and cache the VFIO region info for BAR @index of the device.
 * Returns 0 on success, -errno on failure (with @errp set). */
static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
{
    g_autofree char *barname = NULL;

    assert_bar_index_valid(s, index);
    /* Reset the slot before asking the kernel to fill it in. */
    s->bar_region_info[index] = (struct vfio_region_info) {
        .index = VFIO_PCI_BAR0_REGION_INDEX + index,
        .argsz = sizeof(struct vfio_region_info),
    };
    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[index])) {
        error_setg_errno(errp, errno, "Failed to get BAR region info");
        return -errno;
    }
    barname = g_strdup_printf("bar[%d]", index);
    trace_qemu_vfio_region_info(barname, s->bar_region_info[index].offset,
                                s->bar_region_info[index].size,
                                s->bar_region_info[index].cap_offset);

    return 0;
}
159 | ||
160 | /** | |
161 | * Map a PCI bar area. | |
162 | */ | |
163 | void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index, | |
b02c01a5 | 164 | uint64_t offset, uint64_t size, int prot, |
418026ca FZ |
165 | Error **errp) |
166 | { | |
167 | void *p; | |
8e3b0cbb | 168 | assert(QEMU_IS_ALIGNED(offset, qemu_real_host_page_size())); |
418026ca FZ |
169 | assert_bar_index_valid(s, index); |
170 | p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset), | |
b02c01a5 | 171 | prot, MAP_SHARED, |
418026ca | 172 | s->device, s->bar_region_info[index].offset + offset); |
2817fbce PMD |
173 | trace_qemu_vfio_pci_map_bar(index, s->bar_region_info[index].offset , |
174 | size, offset, p); | |
418026ca FZ |
175 | if (p == MAP_FAILED) { |
176 | error_setg_errno(errp, errno, "Failed to map BAR region"); | |
177 | p = NULL; | |
178 | } | |
179 | return p; | |
180 | } | |
181 | ||
182 | /** | |
183 | * Unmap a PCI bar area. | |
184 | */ | |
185 | void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar, | |
186 | uint64_t offset, uint64_t size) | |
187 | { | |
188 | if (bar) { | |
189 | munmap(bar, MIN(size, s->bar_region_info[index].size - offset)); | |
190 | } | |
191 | } | |
192 | ||
/**
 * Initialize device IRQ with @irq_type and register an event notifier.
 * Returns 0 on success, a negative errno value on failure (@errp set).
 */
int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e,
                           int irq_type, Error **errp)
{
    int r;
    struct vfio_irq_set *irq_set;
    size_t irq_set_size;
    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };

    /* Query the requested IRQ index and check it can signal an eventfd. */
    irq_info.index = irq_type;
    if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) {
        error_setg_errno(errp, errno, "Failed to get device interrupt info");
        return -errno;
    }
    if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
        error_setg(errp, "Device interrupt doesn't support eventfd");
        return -EINVAL;
    }

    /* vfio_irq_set is followed by a single eventfd (int) as payload. */
    irq_set_size = sizeof(*irq_set) + sizeof(int);
    irq_set = g_malloc0(irq_set_size);

    /* Get to a known IRQ state */
    *irq_set = (struct vfio_irq_set) {
        .argsz = irq_set_size,
        .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
        .index = irq_info.index,
        .start = 0,
        .count = 1,
    };

    /* Store the notifier's fd in the trailing payload area. */
    *(int *)&irq_set->data = event_notifier_get_fd(e);
    r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (r) {
        error_setg_errno(errp, errno, "Failed to setup device interrupt");
        return -errno;
    }
    return 0;
}
235 | ||
236 | static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf, | |
237 | int size, int ofs) | |
238 | { | |
239 | int ret; | |
240 | ||
3d87c2d9 PMD |
241 | trace_qemu_vfio_pci_read_config(buf, ofs, size, |
242 | s->config_region_info.offset, | |
243 | s->config_region_info.size); | |
244 | assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size)); | |
37b0b24e NI |
245 | ret = RETRY_ON_EINTR( |
246 | pread(s->device, buf, size, s->config_region_info.offset + ofs) | |
247 | ); | |
418026ca FZ |
248 | return ret == size ? 0 : -errno; |
249 | } | |
250 | ||
251 | static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs) | |
252 | { | |
253 | int ret; | |
254 | ||
3d87c2d9 PMD |
255 | trace_qemu_vfio_pci_write_config(buf, ofs, size, |
256 | s->config_region_info.offset, | |
257 | s->config_region_info.size); | |
258 | assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size)); | |
37b0b24e NI |
259 | ret = RETRY_ON_EINTR( |
260 | pwrite(s->device, buf, size, s->config_region_info.offset + ofs) | |
261 | ); | |
418026ca FZ |
262 | return ret == size ? 0 : -errno; |
263 | } | |
264 | ||
4487d420 EA |
/* Parse the VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE capability out of the
 * VFIO_IOMMU_GET_INFO reply in @buf and copy the usable IOVA ranges into @s.
 * If the capability is absent, @s is left untouched (the caller has already
 * installed a legacy default range). */
static void collect_usable_iova_ranges(QEMUVFIOState *s, void *buf)
{
    struct vfio_iommu_type1_info *info = (struct vfio_iommu_type1_info *)buf;
    struct vfio_info_cap_header *cap = (void *)buf + info->cap_offset;
    struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
    int i;

    /* Walk the capability chain; each ->next is an offset from @buf. */
    while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
        if (!cap->next) {
            return;
        }
        cap = buf + cap->next;
    }

    cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;

    /* NOTE(review): if the kernel reports nr_iovas == 0 this overwrites the
     * caller's default range with an empty set — presumably kernels always
     * report at least one range here; confirm. */
    s->nb_iova_ranges = cap_iova_range->nr_iovas;
    /* The caller pre-allocated a single-entry array; grow it only when the
     * kernel reports more than one range. */
    if (s->nb_iova_ranges > 1) {
        s->usable_iova_ranges =
            g_renew(struct IOVARange, s->usable_iova_ranges,
                    s->nb_iova_ranges);
    }

    for (i = 0; i < s->nb_iova_ranges; i++) {
        s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
        s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
    }
}
293 | ||
418026ca FZ |
294 | static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device, |
295 | Error **errp) | |
296 | { | |
297 | int ret; | |
298 | int i; | |
299 | uint16_t pci_cmd; | |
300 | struct vfio_group_status group_status = { .argsz = sizeof(group_status) }; | |
4487d420 EA |
301 | struct vfio_iommu_type1_info *iommu_info = NULL; |
302 | size_t iommu_info_size = sizeof(*iommu_info); | |
418026ca FZ |
303 | struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; |
304 | char *group_file = NULL; | |
305 | ||
4487d420 EA |
306 | s->usable_iova_ranges = NULL; |
307 | ||
418026ca FZ |
308 | /* Create a new container */ |
309 | s->container = open("/dev/vfio/vfio", O_RDWR); | |
310 | ||
311 | if (s->container == -1) { | |
312 | error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio"); | |
313 | return -errno; | |
314 | } | |
315 | if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) { | |
316 | error_setg(errp, "Invalid VFIO version"); | |
317 | ret = -EINVAL; | |
318 | goto fail_container; | |
319 | } | |
320 | ||
321 | if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) { | |
a4bc212a | 322 | error_setg_errno(errp, errno, "VFIO IOMMU Type1 is not supported"); |
418026ca FZ |
323 | ret = -EINVAL; |
324 | goto fail_container; | |
325 | } | |
326 | ||
327 | /* Open the group */ | |
328 | group_file = sysfs_find_group_file(device, errp); | |
329 | if (!group_file) { | |
330 | ret = -EINVAL; | |
331 | goto fail_container; | |
332 | } | |
333 | ||
334 | s->group = open(group_file, O_RDWR); | |
335 | if (s->group == -1) { | |
336 | error_setg_errno(errp, errno, "Failed to open VFIO group file: %s", | |
337 | group_file); | |
338 | g_free(group_file); | |
339 | ret = -errno; | |
340 | goto fail_container; | |
341 | } | |
342 | g_free(group_file); | |
343 | ||
344 | /* Test the group is viable and available */ | |
345 | if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) { | |
346 | error_setg_errno(errp, errno, "Failed to get VFIO group status"); | |
347 | ret = -errno; | |
348 | goto fail; | |
349 | } | |
350 | ||
351 | if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { | |
352 | error_setg(errp, "VFIO group is not viable"); | |
353 | ret = -EINVAL; | |
354 | goto fail; | |
355 | } | |
356 | ||
357 | /* Add the group to the container */ | |
358 | if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) { | |
359 | error_setg_errno(errp, errno, "Failed to add group to VFIO container"); | |
360 | ret = -errno; | |
361 | goto fail; | |
362 | } | |
363 | ||
364 | /* Enable the IOMMU model we want */ | |
365 | if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) { | |
366 | error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type"); | |
367 | ret = -errno; | |
368 | goto fail; | |
369 | } | |
370 | ||
4487d420 EA |
371 | iommu_info = g_malloc0(iommu_info_size); |
372 | iommu_info->argsz = iommu_info_size; | |
373 | ||
418026ca | 374 | /* Get additional IOMMU info */ |
4487d420 | 375 | if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) { |
418026ca FZ |
376 | error_setg_errno(errp, errno, "Failed to get IOMMU info"); |
377 | ret = -errno; | |
378 | goto fail; | |
379 | } | |
380 | ||
4487d420 EA |
381 | /* |
382 | * if the kernel does not report usable IOVA regions, choose | |
383 | * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region | |
384 | */ | |
385 | s->nb_iova_ranges = 1; | |
386 | s->usable_iova_ranges = g_new0(struct IOVARange, 1); | |
387 | s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN; | |
388 | s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1; | |
389 | ||
390 | if (iommu_info->argsz > iommu_info_size) { | |
391 | iommu_info_size = iommu_info->argsz; | |
392 | iommu_info = g_realloc(iommu_info, iommu_info_size); | |
393 | if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) { | |
394 | ret = -errno; | |
395 | goto fail; | |
396 | } | |
397 | collect_usable_iova_ranges(s, iommu_info); | |
398 | } | |
399 | ||
418026ca FZ |
400 | s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device); |
401 | ||
402 | if (s->device < 0) { | |
403 | error_setg_errno(errp, errno, "Failed to get device fd"); | |
404 | ret = -errno; | |
405 | goto fail; | |
406 | } | |
407 | ||
408 | /* Test and setup the device */ | |
409 | if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) { | |
410 | error_setg_errno(errp, errno, "Failed to get device info"); | |
411 | ret = -errno; | |
412 | goto fail; | |
413 | } | |
414 | ||
415 | if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) { | |
416 | error_setg(errp, "Invalid device regions"); | |
417 | ret = -EINVAL; | |
418 | goto fail; | |
419 | } | |
420 | ||
421 | s->config_region_info = (struct vfio_region_info) { | |
422 | .index = VFIO_PCI_CONFIG_REGION_INDEX, | |
423 | .argsz = sizeof(struct vfio_region_info), | |
424 | }; | |
425 | if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) { | |
426 | error_setg_errno(errp, errno, "Failed to get config region info"); | |
427 | ret = -errno; | |
428 | goto fail; | |
429 | } | |
df058222 PMD |
430 | trace_qemu_vfio_region_info("config", s->config_region_info.offset, |
431 | s->config_region_info.size, | |
432 | s->config_region_info.cap_offset); | |
418026ca | 433 | |
9e722ebc | 434 | for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) { |
418026ca FZ |
435 | ret = qemu_vfio_pci_init_bar(s, i, errp); |
436 | if (ret) { | |
437 | goto fail; | |
438 | } | |
439 | } | |
440 | ||
441 | /* Enable bus master */ | |
442 | ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND); | |
443 | if (ret) { | |
444 | goto fail; | |
445 | } | |
446 | pci_cmd |= PCI_COMMAND_MASTER; | |
447 | ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND); | |
448 | if (ret) { | |
449 | goto fail; | |
450 | } | |
4487d420 | 451 | g_free(iommu_info); |
418026ca FZ |
452 | return 0; |
453 | fail: | |
4487d420 EA |
454 | g_free(s->usable_iova_ranges); |
455 | s->usable_iova_ranges = NULL; | |
456 | s->nb_iova_ranges = 0; | |
457 | g_free(iommu_info); | |
418026ca FZ |
458 | close(s->group); |
459 | fail_container: | |
460 | close(s->container); | |
461 | return ret; | |
462 | } | |
463 | ||
8f44304c DH |
464 | static void qemu_vfio_ram_block_added(RAMBlockNotifier *n, void *host, |
465 | size_t size, size_t max_size) | |
418026ca FZ |
466 | { |
467 | QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier); | |
521b97cd | 468 | Error *local_err = NULL; |
082851a3 DH |
469 | int ret; |
470 | ||
8f44304c | 471 | trace_qemu_vfio_ram_block_added(s, host, max_size); |
521b97cd | 472 | ret = qemu_vfio_dma_map(s, host, max_size, false, NULL, &local_err); |
082851a3 | 473 | if (ret) { |
521b97cd PMD |
474 | error_reportf_err(local_err, |
475 | "qemu_vfio_dma_map(%p, %zu) failed: ", | |
476 | host, max_size); | |
082851a3 | 477 | } |
418026ca FZ |
478 | } |
479 | ||
8f44304c DH |
480 | static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n, void *host, |
481 | size_t size, size_t max_size) | |
418026ca FZ |
482 | { |
483 | QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier); | |
484 | if (host) { | |
8f44304c | 485 | trace_qemu_vfio_ram_block_removed(s, host, max_size); |
418026ca FZ |
486 | qemu_vfio_dma_unmap(s, host); |
487 | } | |
488 | } | |
489 | ||
418026ca FZ |
490 | static void qemu_vfio_open_common(QEMUVFIOState *s) |
491 | { | |
549b50a3 | 492 | qemu_mutex_init(&s->lock); |
418026ca FZ |
493 | s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added; |
494 | s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed; | |
418026ca FZ |
495 | s->low_water_mark = QEMU_VFIO_IOVA_MIN; |
496 | s->high_water_mark = QEMU_VFIO_IOVA_MAX; | |
082851a3 | 497 | ram_block_notifier_add(&s->ram_notifier); |
418026ca FZ |
498 | } |
499 | ||
/**
 * Open a PCI device, e.g. "0000:00:01.0".
 * Returns a newly allocated QEMUVFIOState on success (free with
 * qemu_vfio_close()), or NULL on failure with @errp set.
 */
QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
{
    int r;
    QEMUVFIOState *s = g_new0(QEMUVFIOState, 1);

    /*
     * VFIO may pin all memory inside mappings, resulting it in pinning
     * all memory inside RAM blocks unconditionally.
     */
    r = ram_block_discard_disable(true);
    if (r) {
        error_setg_errno(errp, -r, "Cannot set discarding of RAM broken");
        g_free(s);
        return NULL;
    }

    r = qemu_vfio_init_pci(s, device, errp);
    if (r) {
        /* Roll back the discard-disable taken above. */
        ram_block_discard_disable(false);
        g_free(s);
        return NULL;
    }
    qemu_vfio_open_common(s);
    return s;
}
528 | ||
418026ca FZ |
529 | static void qemu_vfio_dump_mappings(QEMUVFIOState *s) |
530 | { | |
f6b8104d PMD |
531 | for (int i = 0; i < s->nr_mappings; ++i) { |
532 | trace_qemu_vfio_dump_mapping(s->mappings[i].host, | |
533 | s->mappings[i].iova, | |
534 | s->mappings[i].size); | |
418026ca FZ |
535 | } |
536 | } | |
537 | ||
/**
 * Find the mapping entry that contains [host, host + size) and set @index to
 * the position. If no entry contains it, @index is the position _after_ which
 * to insert the new mapping. IOW, it is the index of the largest element that
 * is smaller than @host, or -1 if no entry is.
 */
static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host,
                                           int *index)
{
    IOVAMapping *p = s->mappings;
    IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL;
    IOVAMapping *mid;

    trace_qemu_vfio_find_mapping(s, host);
    if (!p) {
        *index = -1;
        return NULL;
    }
    /* Binary search over the host-sorted array: narrow [p, q] until the
     * midpoint collapses onto p or an exact host-address match is found. */
    while (true) {
        mid = p + (q - p) / 2;
        if (mid == p) {
            break;
        }
        if (mid->host > host) {
            q = mid;
        } else if (mid->host < host) {
            p = mid;
        } else {
            break;
        }
    }
    /* Settle mid on the largest entry whose host address is <= @host
     * (or one before the array when all entries are greater). */
    if (mid->host > host) {
        mid--;
    } else if (mid < &s->mappings[s->nr_mappings - 1]
               && (mid + 1)->host <= host) {
        mid++;
    }
    *index = mid - &s->mappings[0];
    /* Hit only when @host lies inside [mid->host, mid->host + mid->size). */
    if (mid >= &s->mappings[0] &&
        mid->host <= host && mid->host + mid->size > host) {
        assert(mid < &s->mappings[s->nr_mappings]);
        return mid;
    }
    /* At this point *index + 1 is the right position to insert the new
     * mapping.*/
    return NULL;
}
584 | ||
/**
 * Allocate IOVA and create a new mapping record and insert it in @s.
 * @index is the insertion position as computed by qemu_vfio_find_mapping()
 * (plus one); the sorted-by-host-address invariant is preserved.
 */
static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s,
                                          void *host, size_t size,
                                          int index, uint64_t iova)
{
    int shift;
    IOVAMapping m = {.host = host, .size = size, .iova = iova};
    IOVAMapping *insert;

    assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size()));
    assert(QEMU_IS_ALIGNED(s->low_water_mark, qemu_real_host_page_size()));
    assert(QEMU_IS_ALIGNED(s->high_water_mark, qemu_real_host_page_size()));
    trace_qemu_vfio_new_mapping(s, host, size, index, iova);

    assert(index >= 0);
    s->nr_mappings++;
    s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
    insert = &s->mappings[index];
    /* Shift the tail one slot right to open a hole at @index. */
    shift = s->nr_mappings - index - 1;
    if (shift) {
        memmove(insert + 1, insert, shift * sizeof(s->mappings[0]));
    }
    *insert = m;
    return insert;
}
612 | ||
613 | /* Do the DMA mapping with VFIO. */ | |
614 | static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size, | |
f38b376d | 615 | uint64_t iova, Error **errp) |
418026ca FZ |
616 | { |
617 | struct vfio_iommu_type1_dma_map dma_map = { | |
618 | .argsz = sizeof(dma_map), | |
619 | .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, | |
620 | .iova = iova, | |
621 | .vaddr = (uintptr_t)host, | |
622 | .size = size, | |
623 | }; | |
4c946b22 | 624 | trace_qemu_vfio_do_mapping(s, host, iova, size); |
418026ca FZ |
625 | |
626 | if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) { | |
f38b376d | 627 | error_setg_errno(errp, errno, "VFIO_MAP_DMA failed"); |
418026ca FZ |
628 | return -errno; |
629 | } | |
630 | return 0; | |
631 | } | |
632 | ||
/**
 * Undo the DMA mapping from @s with VFIO, and remove from mapping list.
 * The entry is removed and the array compacted even if the kernel unmap
 * ioctl fails (the failure is only reported through @errp).
 */
static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping,
                                   Error **errp)
{
    int index;
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = mapping->iova,
        .size = mapping->size,
    };

    /* @mapping must point into s->mappings. */
    index = mapping - s->mappings;
    assert(mapping->size > 0);
    assert(QEMU_IS_ALIGNED(mapping->size, qemu_real_host_page_size()));
    assert(index >= 0 && index < s->nr_mappings);
    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        error_setg_errno(errp, errno, "VFIO_UNMAP_DMA failed");
    }
    /* Close the hole left by the removed entry and shrink the array. */
    memmove(mapping, &s->mappings[index + 1],
            sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
    s->nr_mappings--;
    s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
}
659 | ||
660 | /* Check if the mapping list is (ascending) ordered. */ | |
661 | static bool qemu_vfio_verify_mappings(QEMUVFIOState *s) | |
662 | { | |
663 | int i; | |
664 | if (QEMU_VFIO_DEBUG) { | |
665 | for (i = 0; i < s->nr_mappings - 1; ++i) { | |
666 | if (!(s->mappings[i].host < s->mappings[i + 1].host)) { | |
cb49dfce | 667 | error_report("item %d not sorted!", i); |
418026ca FZ |
668 | qemu_vfio_dump_mappings(s); |
669 | return false; | |
670 | } | |
671 | if (!(s->mappings[i].host + s->mappings[i].size <= | |
672 | s->mappings[i + 1].host)) { | |
cb49dfce | 673 | error_report("item %d overlap with next!", i); |
418026ca FZ |
674 | qemu_vfio_dump_mappings(s); |
675 | return false; | |
676 | } | |
677 | } | |
678 | } | |
679 | return true; | |
680 | } | |
681 | ||
453095e9 PMD |
/* Allocate @size bytes of fixed IOVA space from the "low" region, scanning
 * the usable ranges in ascending order and advancing low_water_mark.
 * Returns true and stores the address in @iova on success; false with @errp
 * set when no range can fit the allocation.
 * NOTE(review): low_water_mark may be advanced past skipped gaps even when
 * the allocation ultimately fails — presumably acceptable since callers
 * treat failure as fatal; confirm. */
static bool qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size,
                                      uint64_t *iova, Error **errp)
{
    int i;

    for (i = 0; i < s->nb_iova_ranges; i++) {
        /* Skip ranges entirely below the current watermark. */
        if (s->usable_iova_ranges[i].end < s->low_water_mark) {
            continue;
        }
        s->low_water_mark =
            MAX(s->low_water_mark, s->usable_iova_ranges[i].start);

        /* "end - low_water_mark + 1" is the remaining capacity of this
         * range; the "== 0" test catches the wrap-around when end is
         * UINT64_MAX (capacity 2^64, which fits any size). */
        if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size ||
            s->usable_iova_ranges[i].end - s->low_water_mark + 1 == 0) {
            *iova = s->low_water_mark;
            s->low_water_mark += size;
            return true;
        }
    }
    error_setg(errp, "fixed iova range not found");

    return false;
}
705 | ||
453095e9 PMD |
/* Allocate @size bytes of temporary IOVA space from the "high" region,
 * scanning the usable ranges in descending order and lowering
 * high_water_mark.  Returns true and stores the address in @iova on
 * success; false with @errp set when no range can fit the allocation. */
static bool qemu_vfio_find_temp_iova(QEMUVFIOState *s, size_t size,
                                     uint64_t *iova, Error **errp)
{
    int i;

    for (i = s->nb_iova_ranges - 1; i >= 0; i--) {
        /* Skip ranges entirely above the current watermark. */
        if (s->usable_iova_ranges[i].start > s->high_water_mark) {
            continue;
        }
        s->high_water_mark =
            MIN(s->high_water_mark, s->usable_iova_ranges[i].end + 1);

        /* "high_water_mark - start + 1" is the capacity available in this
         * range; the "== 0" test catches the wrap-around when the whole
         * 2^64 space is available. */
        if (s->high_water_mark - s->usable_iova_ranges[i].start + 1 >= size ||
            s->high_water_mark - s->usable_iova_ranges[i].start + 1 == 0) {
            *iova = s->high_water_mark - size;
            s->high_water_mark = *iova;
            return true;
        }
    }
    error_setg(errp, "temporary iova range not found");

    return false;
}
729 | ||
71e3038c PMD |
730 | /** |
731 | * qemu_vfio_water_mark_reached: | |
732 | * | |
733 | * Returns %true if high watermark has been reached, %false otherwise. | |
734 | */ | |
735 | static bool qemu_vfio_water_mark_reached(QEMUVFIOState *s, size_t size, | |
736 | Error **errp) | |
737 | { | |
738 | if (s->high_water_mark - s->low_water_mark + 1 < size) { | |
739 | error_setg(errp, "iova exhausted (water mark reached)"); | |
740 | return true; | |
741 | } | |
742 | return false; | |
743 | } | |
744 | ||
418026ca FZ |
/* Map [host, host + size) area into a contiguous IOVA address space, and store
 * the result in @iova if not NULL. The caller need to make sure the area is
 * aligned to page size, and mustn't overlap with existing mapping areas (split
 * mapping status within this area is not allowed).
 * Returns 0 on success, a negative errno value on failure with @errp set.
 */
int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
                      bool temporary, uint64_t *iova, Error **errp)
{
    int index;
    IOVAMapping *mapping;
    uint64_t iova0;

    assert(QEMU_PTR_IS_ALIGNED(host, qemu_real_host_page_size()));
    assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size()));
    trace_qemu_vfio_dma_map(s, host, size, temporary, iova);
    QEMU_LOCK_GUARD(&s->lock);
    mapping = qemu_vfio_find_mapping(s, host, &index);
    if (mapping) {
        /* Already covered by an existing fixed mapping: reuse its IOVA. */
        iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host);
    } else {
        int ret;

        if (qemu_vfio_water_mark_reached(s, size, errp)) {
            return -ENOMEM;
        }
        if (!temporary) {
            /* Permanent mapping: allocate from the low (fixed) region and
             * record it in the sorted mapping list. */
            if (!qemu_vfio_find_fixed_iova(s, size, &iova0, errp)) {
                return -ENOMEM;
            }

            mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
            assert(qemu_vfio_verify_mappings(s));
            ret = qemu_vfio_do_mapping(s, host, size, iova0, errp);
            if (ret < 0) {
                /* Kernel mapping failed: roll back the list entry. */
                qemu_vfio_undo_mapping(s, mapping, NULL);
                return ret;
            }
            qemu_vfio_dump_mappings(s);
        } else {
            /* Temporary mapping: allocate from the high (volatile) region.
             * Not tracked in the list; reclaimed wholesale by
             * qemu_vfio_dma_reset_temporary(). */
            if (!qemu_vfio_find_temp_iova(s, size, &iova0, errp)) {
                return -ENOMEM;
            }
            ret = qemu_vfio_do_mapping(s, host, size, iova0, errp);
            if (ret < 0) {
                return ret;
            }
        }
    }
    trace_qemu_vfio_dma_mapped(s, host, iova0, size);
    if (iova) {
        *iova = iova0;
    }
    return 0;
}
799 | ||
/* Reset the high watermark and free all "temporary" mappings.
 * The caller must ensure no I/O depending on temporary mappings is in
 * flight.  Returns 0 on success, -errno on failure. */
int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
{
    /* Unmap the whole volatile region [high_water_mark, IOVA_MAX) in one
     * VFIO call.
     * NOTE(review): high_water_mark is read here before @lock is taken
     * below — presumably callers serialize against mappers; confirm. */
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = s->high_water_mark,
        .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark,
    };
    trace_qemu_vfio_dma_reset_temporary(s);
    QEMU_LOCK_GUARD(&s->lock);
    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
        return -errno;
    }
    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
    return 0;
}
818 | ||
819 | /* Unmapping the whole area that was previously mapped with | |
820 | * qemu_vfio_dma_map(). */ | |
821 | void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host) | |
822 | { | |
823 | int index = 0; | |
824 | IOVAMapping *m; | |
825 | ||
826 | if (!host) { | |
827 | return; | |
828 | } | |
829 | ||
830 | trace_qemu_vfio_dma_unmap(s, host); | |
a990858b | 831 | QEMU_LOCK_GUARD(&s->lock); |
418026ca FZ |
832 | m = qemu_vfio_find_mapping(s, host, &index); |
833 | if (!m) { | |
a990858b | 834 | return; |
418026ca FZ |
835 | } |
836 | qemu_vfio_undo_mapping(s, m, NULL); | |
418026ca FZ |
837 | } |
838 | ||
/* Issue a hardware reset of the VFIO device; the ioctl's return value is
 * ignored (used on the close path). */
static void qemu_vfio_reset(QEMUVFIOState *s)
{
    ioctl(s->device, VFIO_DEVICE_RESET);
}
843 | ||
844 | /* Close and free the VFIO resources. */ | |
845 | void qemu_vfio_close(QEMUVFIOState *s) | |
846 | { | |
847 | int i; | |
848 | ||
849 | if (!s) { | |
850 | return; | |
851 | } | |
1f0fea38 SH |
852 | |
853 | ram_block_notifier_remove(&s->ram_notifier); | |
854 | ||
418026ca FZ |
855 | for (i = 0; i < s->nr_mappings; ++i) { |
856 | qemu_vfio_undo_mapping(s, &s->mappings[i], NULL); | |
857 | } | |
1f0fea38 | 858 | |
4487d420 EA |
859 | g_free(s->usable_iova_ranges); |
860 | s->nb_iova_ranges = 0; | |
418026ca FZ |
861 | qemu_vfio_reset(s); |
862 | close(s->device); | |
863 | close(s->group); | |
864 | close(s->container); | |
b430b513 | 865 | ram_block_discard_disable(false); |
418026ca | 866 | } |