1/* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2010-2018 Intel Corporation
3 */
4
5#include <inttypes.h>
6#include <string.h>
7#include <fcntl.h>
8#include <unistd.h>
9#include <sys/ioctl.h>
10
11#include <rte_errno.h>
12#include <rte_log.h>
13#include <rte_memory.h>
14#include <rte_eal_memconfig.h>
15#include <rte_vfio.h>
16
17#include "eal_filesystem.h"
18#include "eal_vfio.h"
19#include "eal_private.h"
20
21#ifdef VFIO_PRESENT
22
23#define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb"
24
25/* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can
26 * recreate the mappings for DPDK segments, but we cannot do so for memory that
27 * was registered by the user themselves, so we need to store the user mappings
28 * somewhere, to recreate them later.
29 */
30#define VFIO_MAX_USER_MEM_MAPS 256
31struct user_mem_map {
32 uint64_t addr;
33 uint64_t iova;
34 uint64_t len;
35};
36
37struct user_mem_maps {
38 rte_spinlock_recursive_t lock;
39 int n_maps;
40 struct user_mem_map maps[VFIO_MAX_USER_MEM_MAPS];
41};
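
/*
 * For illustration, a minimal sketch (addresses and length below are
 * hypothetical): a successful call such as
 *
 *	rte_vfio_dma_map(0x7f0000000000ULL, 0x7f0000000000ULL, 0x200000ULL);
 *
 * is recorded in the per-container list roughly as
 *
 *	struct user_mem_map m = {
 *		.addr = 0x7f0000000000ULL,
 *		.iova = 0x7f0000000000ULL,
 *		.len  = 0x200000ULL,
 *	};
 *
 * so that it can later be replayed through dma_user_map_func() when a
 * group is (re)attached to the container after hotplug.
 */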
42
43struct vfio_config {
44 int vfio_enabled;
45 int vfio_container_fd;
46 int vfio_active_groups;
47 const struct vfio_iommu_type *vfio_iommu_type;
48 struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
49 struct user_mem_maps mem_maps;
50};
51
52/* per-process VFIO config */
53static struct vfio_config vfio_cfgs[VFIO_MAX_CONTAINERS];
54static struct vfio_config *default_vfio_cfg = &vfio_cfgs[0];
55
56static int vfio_type1_dma_map(int);
57static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
58static int vfio_spapr_dma_map(int);
59static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
60static int vfio_noiommu_dma_map(int);
61static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
62static int vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr,
63 uint64_t iova, uint64_t len, int do_map);
64
65/* IOMMU types we support */
66static const struct vfio_iommu_type iommu_types[] = {
67 /* x86 IOMMU, otherwise known as type 1 */
68 {
69 .type_id = RTE_VFIO_TYPE1,
70 .name = "Type 1",
71 .dma_map_func = &vfio_type1_dma_map,
72 .dma_user_map_func = &vfio_type1_dma_mem_map
73 },
74 /* ppc64 IOMMU, otherwise known as spapr */
75 {
76 .type_id = RTE_VFIO_SPAPR,
77 .name = "sPAPR",
78 .dma_map_func = &vfio_spapr_dma_map,
79 .dma_user_map_func = &vfio_spapr_dma_mem_map
80 },
81 /* IOMMU-less mode */
82 {
83 .type_id = RTE_VFIO_NOIOMMU,
84 .name = "No-IOMMU",
85 .dma_map_func = &vfio_noiommu_dma_map,
86 .dma_user_map_func = &vfio_noiommu_dma_mem_map
87 },
88};
89
90static int
91is_null_map(const struct user_mem_map *map)
92{
93 return map->addr == 0 && map->iova == 0 && map->len == 0;
94}
95
96/* we may need to merge user mem maps together when the user maps/unmaps
97 * chunks of memory, so we'll need a comparator function to sort segments.
98 */
99static int
100user_mem_map_cmp(const void *a, const void *b)
101{
102 const struct user_mem_map *umm_a = a;
103 const struct user_mem_map *umm_b = b;
104
105 /* move null entries to end */
106 if (is_null_map(umm_a))
107 return 1;
108 if (is_null_map(umm_b))
109 return -1;
110
111 /* sort by iova first */
112 if (umm_a->iova < umm_b->iova)
113 return -1;
114 if (umm_a->iova > umm_b->iova)
115 return 1;
116
117 if (umm_a->addr < umm_b->addr)
118 return -1;
119 if (umm_a->addr > umm_b->addr)
120 return 1;
121
122 if (umm_a->len < umm_b->len)
123 return -1;
124 if (umm_a->len > umm_b->len)
125 return 1;
126
127 return 0;
128}
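
/*
 * A worked example of the ordering (values are hypothetical): after
 * qsort() with user_mem_map_cmp(), entries come out sorted by IOVA,
 * with all-zero ("null") entries pushed to the end:
 *
 *	before: { .iova = 0x3000 }, { 0 }, { .iova = 0x1000 }
 *	after:  { .iova = 0x1000 }, { .iova = 0x3000 }, { 0 }
 */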
129
130/* adjust a user map entry. this may result in shortening of the existing map,
131 * or in splitting the existing map in two pieces.
132 */
133static void
134adjust_map(struct user_mem_map *src, struct user_mem_map *end,
135 uint64_t remove_va_start, uint64_t remove_len)
136{
137 /* if va start is same as start address, we're simply moving start */
138 if (remove_va_start == src->addr) {
139 src->addr += remove_len;
140 src->iova += remove_len;
141 src->len -= remove_len;
142 } else if (remove_va_start + remove_len == src->addr + src->len) {
143 /* we're shrinking mapping from the end */
144 src->len -= remove_len;
145 } else {
146 /* we're blowing a hole in the middle */
147 struct user_mem_map tmp;
148 uint64_t total_len = src->len;
149
150 /* adjust source segment length */
151 src->len = remove_va_start - src->addr;
152
153 /* create temporary segment in the middle */
154 tmp.addr = src->addr + src->len;
155 tmp.iova = src->iova + src->len;
156 tmp.len = remove_len;
157
158 /* populate end segment - this one we will be keeping */
159 end->addr = tmp.addr + tmp.len;
160 end->iova = tmp.iova + tmp.len;
161 end->len = total_len - src->len - tmp.len;
162 }
163}
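
/*
 * A worked example of the "hole in the middle" case (hypothetical
 * addresses, 1:1 VA:IOVA mapping): removing [0x2000, 0x3000) from a map
 * covering [0x1000, 0x4000) leaves
 *
 *	src: { .addr = 0x1000, .iova = 0x1000, .len = 0x1000 }
 *	end: { .addr = 0x3000, .iova = 0x3000, .len = 0x1000 }
 *
 * i.e. the original entry is shortened and a second entry is written
 * into *end for the part after the removed range.
 */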
164
165/* try merging two maps into one, return 1 if succeeded */
166static int
167merge_map(struct user_mem_map *left, struct user_mem_map *right)
168{
169 if (left->addr + left->len != right->addr)
170 return 0;
171 if (left->iova + left->len != right->iova)
172 return 0;
173
174 left->len += right->len;
175
176 memset(right, 0, sizeof(*right));
177
178 return 1;
179}
180
181static struct user_mem_map *
182find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr,
183 uint64_t iova, uint64_t len)
184{
185 uint64_t va_end = addr + len;
186 uint64_t iova_end = iova + len;
187 int i;
188
189 for (i = 0; i < user_mem_maps->n_maps; i++) {
190 struct user_mem_map *map = &user_mem_maps->maps[i];
191 uint64_t map_va_end = map->addr + map->len;
192 uint64_t map_iova_end = map->iova + map->len;
193
194 /* check start VA */
195 if (addr < map->addr || addr >= map_va_end)
196 continue;
197 /* check if VA end is within boundaries */
198 if (va_end <= map->addr || va_end > map_va_end)
199 continue;
200
201 /* check start IOVA */
202 if (iova < map->iova || iova >= map_iova_end)
203 continue;
204 /* check if IOVA end is within boundaries */
205 if (iova_end <= map->iova || iova_end > map_iova_end)
206 continue;
207
208 /* we've found our map */
209 return map;
210 }
211 return NULL;
212}
213
214/* this will sort all user maps, and merge/compact any adjacent maps */
215static void
216compact_user_maps(struct user_mem_maps *user_mem_maps)
217{
218 int i, n_merged, cur_idx;
219
220 qsort(user_mem_maps->maps, user_mem_maps->n_maps,
221 sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
222
223 /* we'll go over the list backwards when merging */
224 n_merged = 0;
225 for (i = user_mem_maps->n_maps - 2; i >= 0; i--) {
226 struct user_mem_map *l, *r;
227
228 l = &user_mem_maps->maps[i];
229 r = &user_mem_maps->maps[i + 1];
230
231 if (is_null_map(l) || is_null_map(r))
232 continue;
233
234 if (merge_map(l, r))
235 n_merged++;
236 }
237
238 /* the entries are still sorted, but now they have holes in them, so
239 * walk through the list and remove the holes
240 */
241 if (n_merged > 0) {
242 cur_idx = 0;
243 for (i = 0; i < user_mem_maps->n_maps; i++) {
244 if (!is_null_map(&user_mem_maps->maps[i])) {
245 struct user_mem_map *src, *dst;
246
247 src = &user_mem_maps->maps[i];
248 dst = &user_mem_maps->maps[cur_idx++];
249
250 if (src != dst) {
251 memcpy(dst, src, sizeof(*src));
252 memset(src, 0, sizeof(*src));
253 }
254 }
255 }
256 user_mem_maps->n_maps = cur_idx;
257 }
258}
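
/*
 * A worked example of compaction (hypothetical values): two entries
 * that are contiguous in both VA and IOVA,
 *
 *	{ .addr = 0x1000, .iova = 0x9000, .len = 0x1000 }
 *	{ .addr = 0x2000, .iova = 0xa000, .len = 0x1000 }
 *
 * are merged by merge_map() into a single entry
 *
 *	{ .addr = 0x1000, .iova = 0x9000, .len = 0x2000 }
 *
 * and the resulting null entry is squeezed out, reducing n_maps.
 */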
259
260static int
261vfio_open_group_fd(int iommu_group_num)
262{
263 int vfio_group_fd;
264 char filename[PATH_MAX];
265 struct rte_mp_msg mp_req, *mp_rep;
266 struct rte_mp_reply mp_reply;
267 struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
268 struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
269
270 /* if primary, try to open the group */
271 if (internal_config.process_type == RTE_PROC_PRIMARY) {
272 /* try regular group format */
273 snprintf(filename, sizeof(filename),
274 VFIO_GROUP_FMT, iommu_group_num);
275 vfio_group_fd = open(filename, O_RDWR);
276 if (vfio_group_fd < 0) {
277 /* if file not found, it's not an error */
278 if (errno != ENOENT) {
279 RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
280 strerror(errno));
281 return -1;
282 }
283
284 /* special case: try no-IOMMU path as well */
285 snprintf(filename, sizeof(filename),
286 VFIO_NOIOMMU_GROUP_FMT,
287 iommu_group_num);
288 vfio_group_fd = open(filename, O_RDWR);
289 if (vfio_group_fd < 0) {
290 if (errno != ENOENT) {
291 RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
292 strerror(errno));
293 return -1;
294 }
295 return 0;
296 }
297 /* noiommu group found */
298 }
299
300 return vfio_group_fd;
301 }
302 /* if we're in a secondary process, request group fd from the primary
303 * process via mp channel.
304 */
305 p->req = SOCKET_REQ_GROUP;
306 p->group_num = iommu_group_num;
307 strcpy(mp_req.name, EAL_VFIO_MP);
308 mp_req.len_param = sizeof(*p);
309 mp_req.num_fds = 0;
310
311 vfio_group_fd = -1;
312 if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
313 mp_reply.nb_received == 1) {
314 mp_rep = &mp_reply.msgs[0];
315 p = (struct vfio_mp_param *)mp_rep->param;
316 if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
317 vfio_group_fd = mp_rep->fds[0];
318 } else if (p->result == SOCKET_NO_FD) {
319 RTE_LOG(ERR, EAL, " bad VFIO group fd\n");
320 vfio_group_fd = 0;
321 }
322 free(mp_reply.msgs);
323 }
324
325 if (vfio_group_fd < 0)
326 RTE_LOG(ERR, EAL, " cannot request group fd\n");
327 return vfio_group_fd;
328}
329
330static struct vfio_config *
331get_vfio_cfg_by_group_num(int iommu_group_num)
332{
333 struct vfio_config *vfio_cfg;
334 int i, j;
335
336 for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
337 vfio_cfg = &vfio_cfgs[i];
338 for (j = 0; j < VFIO_MAX_GROUPS; j++) {
339 if (vfio_cfg->vfio_groups[j].group_num ==
340 iommu_group_num)
341 return vfio_cfg;
342 }
343 }
344
345 return NULL;
346}
347
348static struct vfio_config *
349get_vfio_cfg_by_group_fd(int vfio_group_fd)
350{
351 struct vfio_config *vfio_cfg;
352 int i, j;
353
354 for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
355 vfio_cfg = &vfio_cfgs[i];
356 for (j = 0; j < VFIO_MAX_GROUPS; j++)
357 if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
358 return vfio_cfg;
359 }
360
361 return NULL;
362}
363
364static struct vfio_config *
365get_vfio_cfg_by_container_fd(int container_fd)
366{
367 int i;
368
369 for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
370 if (vfio_cfgs[i].vfio_container_fd == container_fd)
371 return &vfio_cfgs[i];
372 }
373
374 return NULL;
375}
376
377int
378rte_vfio_get_group_fd(int iommu_group_num)
379{
380 int i;
381 int vfio_group_fd;
382 struct vfio_group *cur_grp;
383 struct vfio_config *vfio_cfg;
384
385 /* get the vfio_config it belongs to */
386 vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
387 vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
388
389 /* check if we already have the group descriptor open */
390 for (i = 0; i < VFIO_MAX_GROUPS; i++)
391 if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num)
392 return vfio_cfg->vfio_groups[i].fd;
393
394 /* Let's first see if there is room for a new group */
395 if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) {
396 RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
397 return -1;
398 }
399
400 /* Now let's get an index for the new group */
401 for (i = 0; i < VFIO_MAX_GROUPS; i++)
402 if (vfio_cfg->vfio_groups[i].group_num == -1) {
403 cur_grp = &vfio_cfg->vfio_groups[i];
404 break;
405 }
406
407 /* This should not happen */
408 if (i == VFIO_MAX_GROUPS) {
409 RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
410 return -1;
411 }
412
413 vfio_group_fd = vfio_open_group_fd(iommu_group_num);
414 if (vfio_group_fd < 0) {
415 RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num);
416 return -1;
417 }
418
419 cur_grp->group_num = iommu_group_num;
420 cur_grp->fd = vfio_group_fd;
421 vfio_cfg->vfio_active_groups++;
422
423 return vfio_group_fd;
424}
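
/*
 * Example usage, a minimal sketch (the sysfs base and PCI address are
 * hypothetical; error handling trimmed):
 *
 *	int group_num, group_fd;
 *
 *	if (rte_vfio_get_group_num("/sys/bus/pci/devices", "0000:00:01.0",
 *			&group_num) == 1) {
 *		group_fd = rte_vfio_get_group_fd(group_num);
 *		if (group_fd > 0)
 *			... group is open, fd is cached in vfio_cfg ...
 *	}
 */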
425
426static int
427get_vfio_group_idx(int vfio_group_fd)
428{
429 struct vfio_config *vfio_cfg;
430 int i, j;
431
432 for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
433 vfio_cfg = &vfio_cfgs[i];
434 for (j = 0; j < VFIO_MAX_GROUPS; j++)
435 if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
436 return j;
437 }
438
439 return -1;
440}
441
442static void
443vfio_group_device_get(int vfio_group_fd)
444{
445 struct vfio_config *vfio_cfg;
446 int i;
447
448 vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
449 if (vfio_cfg == NULL) {
450 RTE_LOG(ERR, EAL, " invalid group fd!\n");
451 return;
452 }
453
454 i = get_vfio_group_idx(vfio_group_fd);
455 if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
456 RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i);
457 else
458 vfio_cfg->vfio_groups[i].devices++;
459}
460
461static void
462vfio_group_device_put(int vfio_group_fd)
463{
464 struct vfio_config *vfio_cfg;
465 int i;
466
467 vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
468 if (vfio_cfg == NULL) {
469 RTE_LOG(ERR, EAL, " invalid group fd!\n");
470 return;
471 }
472
473 i = get_vfio_group_idx(vfio_group_fd);
474 if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
475 RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i);
476 else
477 vfio_cfg->vfio_groups[i].devices--;
478}
479
480static int
481vfio_group_device_count(int vfio_group_fd)
482{
483 struct vfio_config *vfio_cfg;
484 int i;
485
486 vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
487 if (vfio_cfg == NULL) {
488 RTE_LOG(ERR, EAL, " invalid group fd!\n");
489 return -1;
490 }
491
492 i = get_vfio_group_idx(vfio_group_fd);
493 if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) {
494 RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i);
495 return -1;
496 }
497
498 return vfio_cfg->vfio_groups[i].devices;
499}
500
501static void
502vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
503 void *arg __rte_unused)
504{
505 struct rte_memseg_list *msl;
506 struct rte_memseg *ms;
507 size_t cur_len = 0;
508
509 msl = rte_mem_virt2memseg_list(addr);
510
511 /* in IOVA as VA mode, IOVA is identical to VA, so no separate handling is needed */
512 if (rte_eal_iova_mode() == RTE_IOVA_VA) {
513 uint64_t vfio_va = (uint64_t)(uintptr_t)addr;
514 if (type == RTE_MEM_EVENT_ALLOC)
515 vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va,
516 len, 1);
517 else
518 vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va,
519 len, 0);
520 return;
521 }
522
523 /* memsegs are contiguous in memory */
524 ms = rte_mem_virt2memseg(addr, msl);
525 while (cur_len < len) {
526 if (type == RTE_MEM_EVENT_ALLOC)
527 vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
528 ms->iova, ms->len, 1);
529 else
530 vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
531 ms->iova, ms->len, 0);
532
533 cur_len += ms->len;
534 ++ms;
535 }
536}
537
538int
539rte_vfio_clear_group(int vfio_group_fd)
540{
541 int i;
542 struct vfio_config *vfio_cfg;
543
544 vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
545 if (vfio_cfg == NULL) {
546 RTE_LOG(ERR, EAL, " invalid group fd!\n");
547 return -1;
548 }
549
550 i = get_vfio_group_idx(vfio_group_fd);
551 if (i < 0)
552 return -1;
553 vfio_cfg->vfio_groups[i].group_num = -1;
554 vfio_cfg->vfio_groups[i].fd = -1;
555 vfio_cfg->vfio_groups[i].devices = 0;
556 vfio_cfg->vfio_active_groups--;
557
558 return 0;
559}
560
561int
562rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
563 int *vfio_dev_fd, struct vfio_device_info *device_info)
564{
565 struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
566 rte_rwlock_t *mem_lock = &mcfg->memory_hotplug_lock;
567 struct vfio_group_status group_status = {
568 .argsz = sizeof(group_status)
569 };
570 struct vfio_config *vfio_cfg;
571 struct user_mem_maps *user_mem_maps;
572 int vfio_container_fd;
573 int vfio_group_fd;
574 int iommu_group_num;
575 int i, ret;
576
577 /* get group number */
578 ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
579 if (ret == 0) {
580 RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n",
581 dev_addr);
582 return 1;
583 }
584
585 /* if negative, something failed */
586 if (ret < 0)
587 return -1;
588
589 /* get the actual group fd */
590 vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
591 if (vfio_group_fd < 0)
592 return -1;
593
594 /* if group_fd == 0, that means the device isn't managed by VFIO */
595 if (vfio_group_fd == 0) {
596 RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n",
597 dev_addr);
598 return 1;
599 }
600
601 /*
602 * check that the group is viable (meaning, all devices in it are
603 * either bound to VFIO or not bound to anything)
604 */
605
606 /* query the group status */
607 ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status);
608 if (ret) {
609 RTE_LOG(ERR, EAL, " %s cannot get group status, "
610 "error %i (%s)\n", dev_addr, errno, strerror(errno));
611 close(vfio_group_fd);
612 rte_vfio_clear_group(vfio_group_fd);
613 return -1;
614 } else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
615 RTE_LOG(ERR, EAL, " %s VFIO group is not viable!\n", dev_addr);
616 close(vfio_group_fd);
617 rte_vfio_clear_group(vfio_group_fd);
618 return -1;
619 }
620
621 /* get the vfio_config it belongs to */
622 vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
623 vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
624 vfio_container_fd = vfio_cfg->vfio_container_fd;
625 user_mem_maps = &vfio_cfg->mem_maps;
626
627 /* check if group does not have a container yet */
628 if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) {
629
630 /* add group to a container */
631 ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER,
632 &vfio_container_fd);
633 if (ret) {
634 RTE_LOG(ERR, EAL, " %s cannot add VFIO group to container, "
635 "error %i (%s)\n", dev_addr, errno, strerror(errno));
636 close(vfio_group_fd);
637 rte_vfio_clear_group(vfio_group_fd);
638 return -1;
639 }
640
641 /*
642 * pick an IOMMU type and set up DMA mappings for container
643 *
644 * needs to be done only once, only when first group is
645 * assigned to a container and only in primary process.
646 * Note this can happen several times with the hotplug
647 * functionality.
648 */
649 if (internal_config.process_type == RTE_PROC_PRIMARY &&
650 vfio_cfg->vfio_active_groups == 1 &&
651 vfio_group_device_count(vfio_group_fd) == 0) {
652 const struct vfio_iommu_type *t;
653
654 /* select an IOMMU type which we will be using */
655 t = vfio_set_iommu_type(vfio_container_fd);
656 if (!t) {
657 RTE_LOG(ERR, EAL,
658 " %s failed to select IOMMU type\n",
659 dev_addr);
660 close(vfio_group_fd);
661 rte_vfio_clear_group(vfio_group_fd);
662 return -1;
663 }
664 /* lock memory hotplug before mapping and release it
665 * after registering callback, to prevent races
666 */
667 rte_rwlock_read_lock(mem_lock);
668 if (vfio_cfg == default_vfio_cfg &&
669 (internal_config.single_file_segments == 0 ||
670 internal_config.legacy_mem == 0))
671 ret = t->dma_map_func(vfio_container_fd);
672 else
673 ret = 0;
674 if (ret) {
675 RTE_LOG(ERR, EAL,
676 " %s DMA remapping failed, error %i (%s)\n",
677 dev_addr, errno, strerror(errno));
678 close(vfio_group_fd);
679 rte_vfio_clear_group(vfio_group_fd);
680 rte_rwlock_read_unlock(mem_lock);
681 return -1;
682 }
683
684 vfio_cfg->vfio_iommu_type = t;
685
686 /* re-map all user-mapped segments */
687 rte_spinlock_recursive_lock(&user_mem_maps->lock);
688
689 /* this IOMMU type may not support DMA mapping, but
690 * if we have mappings in the list - that means we have
691 * previously mapped something successfully, so we can
692 * be sure that DMA mapping is supported.
693 */
694 for (i = 0; i < user_mem_maps->n_maps; i++) {
695 struct user_mem_map *map;
696 map = &user_mem_maps->maps[i];
697
698 ret = t->dma_user_map_func(
699 vfio_container_fd,
700 map->addr, map->iova, map->len,
701 1);
702 if (ret) {
703 RTE_LOG(ERR, EAL, "Couldn't map user memory for DMA: "
704 "va: 0x%" PRIx64 " "
705 "iova: 0x%" PRIx64 " "
706 "len: 0x%" PRIu64 "\n",
707 map->addr, map->iova,
708 map->len);
709 rte_spinlock_recursive_unlock(
710 &user_mem_maps->lock);
711 rte_rwlock_read_unlock(mem_lock);
712 return -1;
713 }
714 }
715 rte_spinlock_recursive_unlock(&user_mem_maps->lock);
716
717 /* register callback for mem events */
718 if (vfio_cfg == default_vfio_cfg)
719 ret = rte_mem_event_callback_register(
720 VFIO_MEM_EVENT_CLB_NAME,
721 vfio_mem_event_callback, NULL);
722 else
723 ret = 0;
724 /* unlock memory hotplug */
725 rte_rwlock_read_unlock(mem_lock);
726
727 if (ret && rte_errno != ENOTSUP) {
728 RTE_LOG(ERR, EAL, "Could not install memory event callback for VFIO\n");
729 return -1;
730 }
731 if (ret)
732 RTE_LOG(DEBUG, EAL, "Memory event callbacks not supported\n");
733 else
734 RTE_LOG(DEBUG, EAL, "Installed memory event callback for VFIO\n");
735 }
736 }
737
738 /* get a file descriptor for the device */
739 *vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);
740 if (*vfio_dev_fd < 0) {
741 /* if we cannot get a device fd, this implies a problem with
742 * the VFIO group or the container not having IOMMU configured.
743 */
744
745 RTE_LOG(WARNING, EAL, "Getting a vfio_dev_fd for %s failed\n",
746 dev_addr);
747 close(vfio_group_fd);
748 rte_vfio_clear_group(vfio_group_fd);
749 return -1;
750 }
751
752 /* test and setup the device */
753 ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
754 if (ret) {
755 RTE_LOG(ERR, EAL, " %s cannot get device info, "
756 "error %i (%s)\n", dev_addr, errno,
757 strerror(errno));
758 close(*vfio_dev_fd);
759 close(vfio_group_fd);
760 rte_vfio_clear_group(vfio_group_fd);
761 return -1;
762 }
763 vfio_group_device_get(vfio_group_fd);
764
765 return 0;
766}
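
/*
 * Example usage, a minimal sketch (the sysfs base and PCI address are
 * hypothetical; error handling trimmed):
 *
 *	struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
 *	int dev_fd;
 *	int ret;
 *
 *	ret = rte_vfio_setup_device("/sys/bus/pci/devices", "0000:00:01.0",
 *			&dev_fd, &dev_info);
 *	if (ret == 0) {
 *		... use dev_fd, e.g. query regions/IRQs via VFIO ioctls ...
 *		rte_vfio_release_device("/sys/bus/pci/devices",
 *				"0000:00:01.0", dev_fd);
 *	} else if (ret == 1) {
 *		... device not bound to VFIO, fall back to another driver ...
 *	}
 */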
767
768int
769rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
770 int vfio_dev_fd)
771{
772 struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
773 rte_rwlock_t *mem_lock = &mcfg->memory_hotplug_lock;
774 struct vfio_group_status group_status = {
775 .argsz = sizeof(group_status)
776 };
777 struct vfio_config *vfio_cfg;
778 int vfio_group_fd;
779 int iommu_group_num;
780 int ret;
781
782 /* we don't want any DMA mapping requests to come in while we're detaching
783 * the VFIO device, because this might be the last device and we might need
784 * to unregister the callback.
785 */
786 rte_rwlock_read_lock(mem_lock);
787
788 /* get group number */
789 ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
790 if (ret <= 0) {
791 RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver\n",
792 dev_addr);
793 /* This is an error at this point. */
794 ret = -1;
795 goto out;
796 }
797
798 /* get the actual group fd */
799 vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
800 if (vfio_group_fd <= 0) {
801 RTE_LOG(INFO, EAL, "rte_vfio_get_group_fd failed for %s\n",
802 dev_addr);
803 ret = -1;
804 goto out;
805 }
806
807 /* get the vfio_config it belongs to */
808 vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
809 vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
810
811 /* At this point we have an active group. Closing it will trigger the
812 * container detachment. If this is the last active group, the VFIO kernel
813 * code will unset the container and the IOMMU mappings.
814 */
815
816 /* Closing a device */
817 if (close(vfio_dev_fd) < 0) {
818 RTE_LOG(INFO, EAL, "Error when closing vfio_dev_fd for %s\n",
819 dev_addr);
820 ret = -1;
821 goto out;
822 }
823
824 /* A VFIO group can have several devices attached. Only when there are
825 * no devices remaining should the group be closed.
826 */
827 vfio_group_device_put(vfio_group_fd);
828 if (!vfio_group_device_count(vfio_group_fd)) {
829
830 if (close(vfio_group_fd) < 0) {
831 RTE_LOG(INFO, EAL, "Error when closing vfio_group_fd for %s\n",
832 dev_addr);
833 ret = -1;
834 goto out;
835 }
836
837 if (rte_vfio_clear_group(vfio_group_fd) < 0) {
838 RTE_LOG(INFO, EAL, "Error when clearing group for %s\n",
839 dev_addr);
840 ret = -1;
841 goto out;
842 }
843 }
844
845 /* if there are no active device groups, unregister the callback to
846 * avoid spurious attempts to map/unmap memory from VFIO.
847 */
848 if (vfio_cfg == default_vfio_cfg && vfio_cfg->vfio_active_groups == 0)
849 rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME,
850 NULL);
851
852 /* success */
853 ret = 0;
854
855out:
856 rte_rwlock_read_unlock(mem_lock);
857 return ret;
858}
859
860int
861rte_vfio_enable(const char *modname)
862{
863 /* initialize group list */
864 int i, j;
865 int vfio_available;
866
867 rte_spinlock_recursive_t lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER;
868
869 for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
870 vfio_cfgs[i].vfio_container_fd = -1;
871 vfio_cfgs[i].vfio_active_groups = 0;
872 vfio_cfgs[i].vfio_iommu_type = NULL;
873 vfio_cfgs[i].mem_maps.lock = lock;
874
875 for (j = 0; j < VFIO_MAX_GROUPS; j++) {
876 vfio_cfgs[i].vfio_groups[j].fd = -1;
877 vfio_cfgs[i].vfio_groups[j].group_num = -1;
878 vfio_cfgs[i].vfio_groups[j].devices = 0;
879 }
880 }
881
882 /* inform the user that we are probing for VFIO */
883 RTE_LOG(INFO, EAL, "Probing VFIO support...\n");
884
885 /* check if vfio module is loaded */
886 vfio_available = rte_eal_check_module(modname);
887
888 /* return error directly */
889 if (vfio_available == -1) {
890 RTE_LOG(INFO, EAL, "Could not get loaded module details!\n");
891 return -1;
892 }
893
894 /* return 0 if VFIO modules not loaded */
895 if (vfio_available == 0) {
896 RTE_LOG(DEBUG, EAL, "VFIO modules not loaded, "
897 "skipping VFIO support...\n");
898 return 0;
899 }
900
901 default_vfio_cfg->vfio_container_fd = rte_vfio_get_container_fd();
902
903 /* check if we have VFIO driver enabled */
904 if (default_vfio_cfg->vfio_container_fd != -1) {
905 RTE_LOG(NOTICE, EAL, "VFIO support initialized\n");
906 default_vfio_cfg->vfio_enabled = 1;
907 } else {
908 RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n");
909 }
910
911 return 0;
912}
913
914int
915rte_vfio_is_enabled(const char *modname)
916{
917 const int mod_available = rte_eal_check_module(modname) > 0;
918 return default_vfio_cfg->vfio_enabled && mod_available;
919}
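
/*
 * Example usage, a minimal sketch (module name as typically passed by
 * the EAL; error handling trimmed):
 *
 *	if (rte_vfio_enable("vfio") < 0)
 *		... probing itself failed ...
 *	if (rte_vfio_is_enabled("vfio"))
 *		... the vfio module is loaded and a container was opened ...
 *	else
 *		... continue without VFIO ...
 */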
920
921const struct vfio_iommu_type *
922vfio_set_iommu_type(int vfio_container_fd)
923{
924 unsigned idx;
925 for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
926 const struct vfio_iommu_type *t = &iommu_types[idx];
927
928 int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
929 t->type_id);
930 if (!ret) {
931 RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n",
932 t->type_id, t->name);
933 return t;
934 }
935 /* not an error, there may be more supported IOMMU types */
936 RTE_LOG(DEBUG, EAL, " set IOMMU type %d (%s) failed, "
937 "error %i (%s)\n", t->type_id, t->name, errno,
938 strerror(errno));
939 }
940 /* if we didn't find a suitable IOMMU type, fail */
941 return NULL;
942}
943
944int
945vfio_has_supported_extensions(int vfio_container_fd)
946{
947 int ret;
948 unsigned idx, n_extensions = 0;
949 for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
950 const struct vfio_iommu_type *t = &iommu_types[idx];
951
952 ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION,
953 t->type_id);
954 if (ret < 0) {
955 RTE_LOG(ERR, EAL, " could not get IOMMU type, "
956 "error %i (%s)\n", errno,
957 strerror(errno));
958 close(vfio_container_fd);
959 return -1;
960 } else if (ret == 1) {
961 /* we found a supported extension */
962 n_extensions++;
963 }
964 RTE_LOG(DEBUG, EAL, " IOMMU type %d (%s) is %s\n",
965 t->type_id, t->name,
966 ret ? "supported" : "not supported");
967 }
968
969 /* if we didn't find any supported IOMMU types, fail */
970 if (!n_extensions) {
971 close(vfio_container_fd);
972 return -1;
973 }
974
975 return 0;
976}
977
978int
979rte_vfio_get_container_fd(void)
980{
981 int ret, vfio_container_fd;
982 struct rte_mp_msg mp_req, *mp_rep;
983 struct rte_mp_reply mp_reply;
984 struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
985 struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
986
987
988 /* if we're in a primary process, try to open the container */
989 if (internal_config.process_type == RTE_PROC_PRIMARY) {
990 vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR);
991 if (vfio_container_fd < 0) {
992 RTE_LOG(ERR, EAL, " cannot open VFIO container, "
993 "error %i (%s)\n", errno, strerror(errno));
994 return -1;
995 }
996
997 /* check VFIO API version */
998 ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION);
999 if (ret != VFIO_API_VERSION) {
1000 if (ret < 0)
1001 RTE_LOG(ERR, EAL, " could not get VFIO API version, "
1002 "error %i (%s)\n", errno, strerror(errno));
1003 else
1004 RTE_LOG(ERR, EAL, " unsupported VFIO API version!\n");
1005 close(vfio_container_fd);
1006 return -1;
1007 }
1008
1009 ret = vfio_has_supported_extensions(vfio_container_fd);
1010 if (ret) {
1011 RTE_LOG(ERR, EAL, " no supported IOMMU "
1012 "extensions found!\n");
1013 return -1;
1014 }
1015
1016 return vfio_container_fd;
1017 }
1018 /*
1019 * if we're in a secondary process, request container fd from the
1020 * primary process via mp channel
1021 */
1022 p->req = SOCKET_REQ_CONTAINER;
1023 strcpy(mp_req.name, EAL_VFIO_MP);
1024 mp_req.len_param = sizeof(*p);
1025 mp_req.num_fds = 0;
1026
1027 vfio_container_fd = -1;
1028 if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
1029 mp_reply.nb_received == 1) {
1030 mp_rep = &mp_reply.msgs[0];
1031 p = (struct vfio_mp_param *)mp_rep->param;
1032 if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
1033 free(mp_reply.msgs);
1034 return mp_rep->fds[0];
1035 }
1036 free(mp_reply.msgs);
1037 }
1038
1039 RTE_LOG(ERR, EAL, " cannot request container fd\n");
1040 return -1;
1041}
1042
1043int
1044rte_vfio_get_group_num(const char *sysfs_base,
1045 const char *dev_addr, int *iommu_group_num)
1046{
1047 char linkname[PATH_MAX];
1048 char filename[PATH_MAX];
1049 char *tok[16], *group_tok, *end;
1050 int ret;
1051
1052 memset(linkname, 0, sizeof(linkname));
1053 memset(filename, 0, sizeof(filename));
1054
1055 /* try to find out IOMMU group for this device */
1056 snprintf(linkname, sizeof(linkname),
1057 "%s/%s/iommu_group", sysfs_base, dev_addr);
1058
1059 ret = readlink(linkname, filename, sizeof(filename));
1060
1061 /* if the link doesn't exist, no VFIO for us */
1062 if (ret < 0)
1063 return 0;
1064
1065 ret = rte_strsplit(filename, sizeof(filename),
1066 tok, RTE_DIM(tok), '/');
1067
1068 if (ret <= 0) {
1069 RTE_LOG(ERR, EAL, " %s cannot get IOMMU group\n", dev_addr);
1070 return -1;
1071 }
1072
1073 /* IOMMU group is always the last token */
1074 errno = 0;
1075 group_tok = tok[ret - 1];
1076 end = group_tok;
1077 *iommu_group_num = strtol(group_tok, &end, 10);
1078 if ((end != group_tok && *end != '\0') || errno != 0) {
1079 RTE_LOG(ERR, EAL, " %s error parsing IOMMU number!\n", dev_addr);
1080 return -1;
1081 }
1082
1083 return 1;
1084}
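
/*
 * For illustration (hypothetical device): readlink() on
 * "/sys/bus/pci/devices/0000:00:01.0/iommu_group" usually yields a
 * relative path such as "../../../../kernel/iommu_groups/42";
 * rte_strsplit() breaks it on '/', and the last token ("42") is parsed
 * with strtol() into *iommu_group_num.
 */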
1085
1086static int
1087type1_map(const struct rte_memseg_list *msl __rte_unused,
1088 const struct rte_memseg *ms, void *arg)
1089{
1090 int *vfio_container_fd = arg;
1091
1092 return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
1093 ms->len, 1);
1094}
1095
1096static int
1097vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
1098 uint64_t len, int do_map)
1099{
1100 struct vfio_iommu_type1_dma_map dma_map;
1101 struct vfio_iommu_type1_dma_unmap dma_unmap;
1102 int ret;
1103
1104 if (do_map != 0) {
1105 memset(&dma_map, 0, sizeof(dma_map));
1106 dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
1107 dma_map.vaddr = vaddr;
1108 dma_map.size = len;
1109 dma_map.iova = iova;
1110 dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
1111 VFIO_DMA_MAP_FLAG_WRITE;
1112
1113 ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
1114 if (ret) {
1115 RTE_LOG(ERR, EAL, " cannot set up DMA remapping, error %i (%s)\n",
1116 errno, strerror(errno));
1117 return -1;
1118 }
1119 } else {
1120 memset(&dma_unmap, 0, sizeof(dma_unmap));
1121 dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
1122 dma_unmap.size = len;
1123 dma_unmap.iova = iova;
1124
1125 ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
1126 &dma_unmap);
1127 if (ret) {
1128 RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n",
1129 errno, strerror(errno));
1130 return -1;
1131 }
1132 }
1133
1134 return 0;
1135}
1136
1137static int
1138vfio_type1_dma_map(int vfio_container_fd)
1139{
1140 return rte_memseg_walk(type1_map, &vfio_container_fd);
1141}
1142
1143static int
1144vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
1145 uint64_t len, int do_map)
1146{
1147 struct vfio_iommu_type1_dma_map dma_map;
1148 struct vfio_iommu_type1_dma_unmap dma_unmap;
1149 int ret;
1150
1151 if (do_map != 0) {
1152 memset(&dma_map, 0, sizeof(dma_map));
1153 dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
1154 dma_map.vaddr = vaddr;
1155 dma_map.size = len;
1156 dma_map.iova = iova;
1157 dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
1158 VFIO_DMA_MAP_FLAG_WRITE;
1159
1160 ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
1161 if (ret) {
1162 RTE_LOG(ERR, EAL, " cannot set up DMA remapping, error %i (%s)\n",
1163 errno, strerror(errno));
1164 return -1;
1165 }
1166
1167 } else {
1168 struct vfio_iommu_spapr_register_memory reg = {
1169 .argsz = sizeof(reg),
1170 .flags = 0
1171 };
1172 reg.vaddr = (uintptr_t) vaddr;
1173 reg.size = len;
1174
1175 ret = ioctl(vfio_container_fd,
1176 VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
1177 if (ret) {
1178 RTE_LOG(ERR, EAL, " cannot unregister vaddr for IOMMU, error %i (%s)\n",
1179 errno, strerror(errno));
1180 return -1;
1181 }
1182
1183 memset(&dma_unmap, 0, sizeof(dma_unmap));
1184 dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
1185 dma_unmap.size = len;
1186 dma_unmap.iova = iova;
1187
1188 ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
1189 &dma_unmap);
1190 if (ret) {
1191 RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n",
1192 errno, strerror(errno));
1193 return -1;
1194 }
1195 }
1196
1197 return 0;
1198}
1199
1200static int
1201vfio_spapr_map_walk(const struct rte_memseg_list *msl __rte_unused,
1202 const struct rte_memseg *ms, void *arg)
1203{
1204 int *vfio_container_fd = arg;
1205
1206 return vfio_spapr_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
1207 ms->len, 1);
1208}
1209
1210struct spapr_walk_param {
1211 uint64_t window_size;
1212 uint64_t hugepage_sz;
1213};
1214static int
1215vfio_spapr_window_size_walk(const struct rte_memseg_list *msl __rte_unused,
1216 const struct rte_memseg *ms, void *arg)
1217{
1218 struct spapr_walk_param *param = arg;
1219 uint64_t max = ms->iova + ms->len;
1220
1221 if (max > param->window_size) {
1222 param->hugepage_sz = ms->hugepage_sz;
1223 param->window_size = max;
1224 }
1225
1226 return 0;
1227}
1228
1229static int
1230vfio_spapr_create_new_dma_window(int vfio_container_fd,
1231 struct vfio_iommu_spapr_tce_create *create) {
1232 struct vfio_iommu_spapr_tce_remove remove = {
1233 .argsz = sizeof(remove),
1234 };
1235 struct vfio_iommu_spapr_tce_info info = {
1236 .argsz = sizeof(info),
1237 };
1238 int ret;
1239
1240 /* query spapr iommu info */
1241 ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
1242 if (ret) {
1243 RTE_LOG(ERR, EAL, " cannot get iommu info, "
1244 "error %i (%s)\n", errno, strerror(errno));
1245 return -1;
1246 }
1247
1248 /* remove default DMA of 32 bit window */
1249 remove.start_addr = info.dma32_window_start;
1250 ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
1251 if (ret) {
1252 RTE_LOG(ERR, EAL, " cannot remove default DMA window, "
1253 "error %i (%s)\n", errno, strerror(errno));
1254 return -1;
1255 }
1256
1257 /* create new DMA window */
1258 ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, create);
1259 if (ret) {
1260 RTE_LOG(ERR, EAL, " cannot create new DMA window, "
1261 "error %i (%s)\n", errno, strerror(errno));
1262 return -1;
1263 }
1264
1265 if (create->start_addr != 0) {
1266 RTE_LOG(ERR, EAL, " DMA window start address != 0\n");
1267 return -1;
1268 }
1269
1270 return 0;
1271}
1272
1273static int
1274vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
1275 uint64_t len, int do_map)
1276{
1277 struct spapr_walk_param param;
1278 struct vfio_iommu_spapr_tce_create create = {
1279 .argsz = sizeof(create),
1280 };
1281 struct vfio_config *vfio_cfg;
1282 struct user_mem_maps *user_mem_maps;
1283 int i, ret = 0;
1284
1285 vfio_cfg = get_vfio_cfg_by_container_fd(vfio_container_fd);
1286 if (vfio_cfg == NULL) {
1287 RTE_LOG(ERR, EAL, " invalid container fd!\n");
1288 return -1;
1289 }
1290
1291 user_mem_maps = &vfio_cfg->mem_maps;
1292 rte_spinlock_recursive_lock(&user_mem_maps->lock);
1293
1294 /* check if window size needs to be adjusted */
1295 memset(&param, 0, sizeof(param));
1296
1297 /* we're inside a callback so use thread-unsafe version */
1298 if (rte_memseg_walk_thread_unsafe(vfio_spapr_window_size_walk,
1299 &param) < 0) {
1300 RTE_LOG(ERR, EAL, "Could not get window size\n");
1301 ret = -1;
1302 goto out;
1303 }
1304
1305 /* also check user maps */
1306 for (i = 0; i < user_mem_maps->n_maps; i++) {
1307 uint64_t max = user_mem_maps->maps[i].iova +
1308 user_mem_maps->maps[i].len;
1309 param.window_size = RTE_MAX(param.window_size, max);
1310 }
1311
1312 /* sPAPR requires window size to be a power of 2 */
1313 create.window_size = rte_align64pow2(param.window_size);
1314 create.page_shift = __builtin_ctzll(param.hugepage_sz);
1315 create.levels = 1;
1316
1317 if (do_map) {
1318 void *addr;
1319 /* re-create window and remap the entire memory */
1320 if (iova > create.window_size) {
1321 if (vfio_spapr_create_new_dma_window(vfio_container_fd,
1322 &create) < 0) {
1323 RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
1324 ret = -1;
1325 goto out;
1326 }
1327 /* we're inside a callback, so use thread-unsafe version
1328 */
1329 if (rte_memseg_walk_thread_unsafe(vfio_spapr_map_walk,
1330 &vfio_container_fd) < 0) {
1331 RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n");
1332 ret = -1;
1333 goto out;
1334 }
1335 /* remap all user maps */
1336 for (i = 0; i < user_mem_maps->n_maps; i++) {
1337 struct user_mem_map *map =
1338 &user_mem_maps->maps[i];
1339 if (vfio_spapr_dma_do_map(vfio_container_fd,
1340 map->addr, map->iova, map->len,
1341 1)) {
1342 RTE_LOG(ERR, EAL, "Could not recreate user DMA maps\n");
1343 ret = -1;
1344 goto out;
1345 }
1346 }
1347 }
1348
1349 /* now that we've remapped all of the memory that was present
1350 * before, map the segment that we were requested to map.
1351 *
1352 * however, if we were called by the callback, the memory we
1353 * were called with was already in the memseg list, so previous
1354 * mapping should've mapped that segment already.
1355 *
1356 * virt2memseg_list is a relatively cheap check, so use that. if
1357 * memory is within any memseg list, it's a memseg, so it's
1358 * already mapped.
1359 */
1360 addr = (void *)(uintptr_t)vaddr;
1361 if (rte_mem_virt2memseg_list(addr) == NULL &&
1362 vfio_spapr_dma_do_map(vfio_container_fd,
1363 vaddr, iova, len, 1) < 0) {
1364 RTE_LOG(ERR, EAL, "Could not map segment\n");
1365 ret = -1;
1366 goto out;
1367 }
1368 } else {
1369 /* for unmap, check if iova within DMA window */
1370 if (iova > create.window_size) {
1371 RTE_LOG(ERR, EAL, "iova beyond DMA window for unmap\n");
1372 ret = -1;
1373 goto out;
1374 }
1375
1376 vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 0);
1377 }
1378out:
1379 rte_spinlock_recursive_unlock(&user_mem_maps->lock);
1380 return ret;
1381}
1382
1383static int
1384vfio_spapr_dma_map(int vfio_container_fd)
1385{
1386 struct vfio_iommu_spapr_tce_create create = {
1387 .argsz = sizeof(create),
1388 };
1389 struct spapr_walk_param param;
1390
1391 memset(&param, 0, sizeof(param));
1392
1393 /* create DMA window from 0 to max(phys_addr + len) */
1394 rte_memseg_walk(vfio_spapr_window_size_walk, &param);
1395
1396 /* sPAPR requires window size to be a power of 2 */
1397 create.window_size = rte_align64pow2(param.window_size);
1398 create.page_shift = __builtin_ctzll(param.hugepage_sz);
1399 create.levels = 1;
1400
1401 if (vfio_spapr_create_new_dma_window(vfio_container_fd, &create) < 0) {
1402 RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
1403 return -1;
1404 }
1405
1406 /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
1407 if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0)
1408 return -1;
1409
1410 return 0;
1411}
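
/*
 * A worked example of the window sizing above (hypothetical values):
 * if the largest IOVA end seen by the walk is 0x180000000 (6 GiB),
 * rte_align64pow2() rounds the window up to 0x200000000 (8 GiB), and
 * with 16 MiB hugepages __builtin_ctzll(0x1000000) gives page_shift = 24.
 */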
1412
1413static int
1414vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
1415{
1416 /* No-IOMMU mode does not need DMA mapping */
1417 return 0;
1418}
1419
1420static int
1421vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd,
1422 uint64_t __rte_unused vaddr,
1423 uint64_t __rte_unused iova, uint64_t __rte_unused len,
1424 int __rte_unused do_map)
1425{
1426 /* No-IOMMU mode does not need DMA mapping */
1427 return 0;
1428}
1429
1430static int
1431vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
1432 uint64_t len, int do_map)
1433{
1434 const struct vfio_iommu_type *t = vfio_cfg->vfio_iommu_type;
1435
1436 if (!t) {
1437 RTE_LOG(ERR, EAL, " VFIO support not initialized\n");
1438 rte_errno = ENODEV;
1439 return -1;
1440 }
1441
1442 if (!t->dma_user_map_func) {
1443 RTE_LOG(ERR, EAL,
1444 " VFIO custom DMA region maping not supported by IOMMU %s\n",
1445 t->name);
1446 rte_errno = ENOTSUP;
1447 return -1;
1448 }
1449
1450 return t->dma_user_map_func(vfio_cfg->vfio_container_fd, vaddr, iova,
1451 len, do_map);
1452}
1453
1454static int
1455container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
1456 uint64_t len)
1457{
1458 struct user_mem_map *new_map;
1459 struct user_mem_maps *user_mem_maps;
1460 int ret = 0;
1461
1462 user_mem_maps = &vfio_cfg->mem_maps;
1463 rte_spinlock_recursive_lock(&user_mem_maps->lock);
1464 if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
1465 RTE_LOG(ERR, EAL, "No more space for user mem maps\n");
1466 rte_errno = ENOMEM;
1467 ret = -1;
1468 goto out;
1469 }
1470 /* map the entry */
1471 if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) {
1472 /* technically, this will fail if there are currently no devices
1473 * plugged in, even though the mapping might have succeeded had a
1474 * device been added later. however, since we cannot verify whether
1475 * this is a valid mapping without having a device attached, consider
1476 * it unsupported, because we can't just store any old mapping and
1477 * pollute the list of active mappings willy-nilly.
1478 */
1479 RTE_LOG(ERR, EAL, "Couldn't map new region for DMA\n");
1480 ret = -1;
1481 goto out;
1482 }
1483 /* create new user mem map entry */
1484 new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
1485 new_map->addr = vaddr;
1486 new_map->iova = iova;
1487 new_map->len = len;
1488
1489 compact_user_maps(user_mem_maps);
1490out:
1491 rte_spinlock_recursive_unlock(&user_mem_maps->lock);
1492 return ret;
1493}
1494
1495static int
1496container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
1497 uint64_t len)
1498{
1499 struct user_mem_map *map, *new_map = NULL;
1500 struct user_mem_maps *user_mem_maps;
1501 int ret = 0;
1502
1503 user_mem_maps = &vfio_cfg->mem_maps;
1504 rte_spinlock_recursive_lock(&user_mem_maps->lock);
1505
1506 /* find our mapping */
1507 map = find_user_mem_map(user_mem_maps, vaddr, iova, len);
1508 if (!map) {
1509 RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n");
1510 rte_errno = EINVAL;
1511 ret = -1;
1512 goto out;
1513 }
1514 if (map->addr != vaddr || map->iova != iova || map->len != len) {
1515 /* we're partially unmapping a previously mapped region, so we
1516 * need to split entry into two.
1517 */
1518 if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
1519 RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
1520 rte_errno = ENOMEM;
1521 ret = -1;
1522 goto out;
1523 }
1524 new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
1525 }
1526
1527 /* unmap the entry */
1528 if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 0)) {
1529 /* there may not be any devices plugged in, so unmapping will
1530 * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't
1531 * stop us from removing the mapping, as the assumption is we
1532 * won't be needing this memory any more and thus will want to
1533 * prevent it from being remapped again on hotplug. so, only
1534 * fail if we indeed failed to unmap (e.g. if the mapping was
1535 * within our mapped range but had invalid alignment).
1536 */
1537 if (rte_errno != ENODEV && rte_errno != ENOTSUP) {
1538 RTE_LOG(ERR, EAL, "Couldn't unmap region for DMA\n");
1539 ret = -1;
1540 goto out;
1541 } else {
1542 RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n");
1543 }
1544 }
1545 /* remove map from the list of active mappings */
1546 if (new_map != NULL) {
1547 adjust_map(map, new_map, vaddr, len);
1548
1549 /* if we've created a new map by splitting, sort everything */
1550 if (!is_null_map(new_map)) {
1551 compact_user_maps(user_mem_maps);
1552 } else {
1553 /* we've created a new mapping, but it was unused */
1554 user_mem_maps->n_maps--;
1555 }
1556 } else {
1557 memset(map, 0, sizeof(*map));
1558 compact_user_maps(user_mem_maps);
1559 user_mem_maps->n_maps--;
1560 }
1561
1562out:
1563 rte_spinlock_recursive_unlock(&user_mem_maps->lock);
1564 return ret;
1565}
1566
1567int
1568rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len)
1569{
1570 if (len == 0) {
1571 rte_errno = EINVAL;
1572 return -1;
1573 }
1574
1575 return container_dma_map(default_vfio_cfg, vaddr, iova, len);
1576}
1577
1578int
1579rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len)
1580{
1581 if (len == 0) {
1582 rte_errno = EINVAL;
1583 return -1;
1584 }
1585
1586 return container_dma_unmap(default_vfio_cfg, vaddr, iova, len);
1587}
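
/*
 * Example usage, a minimal sketch (assumes the default container, a
 * page-aligned buffer obtained elsewhere, and 1:1 VA:IOVA mapping;
 * error handling trimmed):
 *
 *	uint64_t va = (uint64_t)(uintptr_t)buf;
 *	uint64_t sz = 2 * 1024 * 1024;
 *
 *	if (rte_vfio_dma_map(va, va, sz) < 0)
 *		... inspect rte_errno (e.g. ENODEV, ENOTSUP, ENOMEM) ...
 *	... do DMA to/from buf ...
 *	rte_vfio_dma_unmap(va, va, sz);
 */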
1588
1589int
1590rte_vfio_noiommu_is_enabled(void)
1591{
1592 int fd;
1593 ssize_t cnt;
1594 char c;
1595
1596 fd = open(VFIO_NOIOMMU_MODE, O_RDONLY);
1597 if (fd < 0) {
1598 if (errno != ENOENT) {
1599 RTE_LOG(ERR, EAL, " cannot open vfio noiommu file %i (%s)\n",
1600 errno, strerror(errno));
1601 return -1;
1602 }
1603 /*
1604 * otherwise the file does not exist,
1605 * i.e. noiommu is not enabled
1606 */
1607 return 0;
1608 }
1609
1610 cnt = read(fd, &c, 1);
1611 close(fd);
1612 if (cnt != 1) {
1613 RTE_LOG(ERR, EAL, " unable to read from vfio noiommu "
1614 "file %i (%s)\n", errno, strerror(errno));
1615 return -1;
1616 }
1617
1618 return c == 'Y';
1619}
1620
1621int
1622rte_vfio_container_create(void)
1623{
1624 int i;
1625
1626 /* Find an empty slot to store new vfio config */
1627 for (i = 1; i < VFIO_MAX_CONTAINERS; i++) {
1628 if (vfio_cfgs[i].vfio_container_fd == -1)
1629 break;
1630 }
1631
1632 if (i == VFIO_MAX_CONTAINERS) {
1633 RTE_LOG(ERR, EAL, "exceeded max VFIO container limit\n");
1634 return -1;
1635 }
1636
1637 vfio_cfgs[i].vfio_container_fd = rte_vfio_get_container_fd();
1638 if (vfio_cfgs[i].vfio_container_fd < 0) {
1639 RTE_LOG(NOTICE, EAL, "failed to create a new container\n");
1640 return -1;
1641 }
1642
1643 return vfio_cfgs[i].vfio_container_fd;
1644}
1645
1646int __rte_experimental
1647rte_vfio_container_destroy(int container_fd)
1648{
1649 struct vfio_config *vfio_cfg;
1650 int i;
1651
1652 vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
1653 if (vfio_cfg == NULL) {
1654 RTE_LOG(ERR, EAL, "Invalid container fd\n");
1655 return -1;
1656 }
1657
1658 for (i = 0; i < VFIO_MAX_GROUPS; i++)
1659 if (vfio_cfg->vfio_groups[i].group_num != -1)
1660 rte_vfio_container_group_unbind(container_fd,
1661 vfio_cfg->vfio_groups[i].group_num);
1662
1663 close(container_fd);
1664 vfio_cfg->vfio_container_fd = -1;
1665 vfio_cfg->vfio_active_groups = 0;
1666 vfio_cfg->vfio_iommu_type = NULL;
1667
1668 return 0;
1669}
1670
1671int
1672rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
1673{
1674 struct vfio_config *vfio_cfg;
1675 struct vfio_group *cur_grp;
1676 int vfio_group_fd;
1677 int i;
1678
1679 vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
1680 if (vfio_cfg == NULL) {
1681 RTE_LOG(ERR, EAL, "Invalid container fd\n");
1682 return -1;
1683 }
1684
1685 /* Check room for new group */
1686 if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) {
1687 RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
1688 return -1;
1689 }
1690
1691 /* Get an index for the new group */
1692 for (i = 0; i < VFIO_MAX_GROUPS; i++)
1693 if (vfio_cfg->vfio_groups[i].group_num == -1) {
1694 cur_grp = &vfio_cfg->vfio_groups[i];
1695 break;
1696 }
1697
1698 /* This should not happen */
1699 if (i == VFIO_MAX_GROUPS) {
1700 RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
1701 return -1;
1702 }
1703
1704 vfio_group_fd = vfio_open_group_fd(iommu_group_num);
1705 if (vfio_group_fd < 0) {
1706 RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num);
1707 return -1;
1708 }
1709 cur_grp->group_num = iommu_group_num;
1710 cur_grp->fd = vfio_group_fd;
1711 cur_grp->devices = 0;
1712 vfio_cfg->vfio_active_groups++;
1713
1714 return vfio_group_fd;
1715}
1716
1717int
1718rte_vfio_container_group_unbind(int container_fd, int iommu_group_num)
1719{
1720 struct vfio_config *vfio_cfg;
1721 struct vfio_group *cur_grp = NULL;
1722 int i;
1723
1724 vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
1725 if (vfio_cfg == NULL) {
1726 RTE_LOG(ERR, EAL, "Invalid container fd\n");
1727 return -1;
1728 }
1729
1730 for (i = 0; i < VFIO_MAX_GROUPS; i++) {
1731 if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) {
1732 cur_grp = &vfio_cfg->vfio_groups[i];
1733 break;
1734 }
1735 }
1736
1737 /* This should not happen */
1738 if (i == VFIO_MAX_GROUPS || cur_grp == NULL) {
1739 RTE_LOG(ERR, EAL, "Specified group number not found\n");
1740 return -1;
1741 }
1742
1743 if (cur_grp->fd >= 0 && close(cur_grp->fd) < 0) {
1744 RTE_LOG(ERR, EAL, "Error when closing vfio_group_fd for"
1745 " iommu_group_num %d\n", iommu_group_num);
1746 return -1;
1747 }
1748 cur_grp->group_num = -1;
1749 cur_grp->fd = -1;
1750 cur_grp->devices = 0;
1751 vfio_cfg->vfio_active_groups--;
1752
1753 return 0;
1754}
1755
1756int
1757rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova,
1758 uint64_t len)
1759{
1760 struct vfio_config *vfio_cfg;
1761
1762 if (len == 0) {
1763 rte_errno = EINVAL;
1764 return -1;
1765 }
1766
1767 vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
1768 if (vfio_cfg == NULL) {
1769 RTE_LOG(ERR, EAL, "Invalid container fd\n");
1770 return -1;
1771 }
1772
1773 return container_dma_map(vfio_cfg, vaddr, iova, len);
1774}
1775
1776int
1777rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova,
1778 uint64_t len)
1779{
1780 struct vfio_config *vfio_cfg;
1781
1782 if (len == 0) {
1783 rte_errno = EINVAL;
1784 return -1;
1785 }
1786
1787 vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
1788 if (vfio_cfg == NULL) {
1789 RTE_LOG(ERR, EAL, "Invalid container fd\n");
1790 return -1;
1791 }
1792
1793 return container_dma_unmap(vfio_cfg, vaddr, iova, len);
1794}
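
/*
 * Example usage of the multi-container API, a minimal sketch (the group
 * number is hypothetical, and va/iova/len refer to a previously prepared
 * memory region; error handling trimmed):
 *
 *	int container_fd = rte_vfio_container_create();
 *	int group_fd = rte_vfio_container_group_bind(container_fd, 42);
 *
 *	rte_vfio_container_dma_map(container_fd, va, iova, len);
 *	... attach devices from group 42 and do DMA ...
 *	rte_vfio_container_dma_unmap(container_fd, va, iova, len);
 *
 *	rte_vfio_container_group_unbind(container_fd, 42);
 *	rte_vfio_container_destroy(container_fd);
 */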
1795
1796#else
1797
1798int
1799rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova,
1800 __rte_unused uint64_t len)
1801{
1802 return -1;
1803}
1804
1805int
1806rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova,
1807 __rte_unused uint64_t len)
1808{
1809 return -1;
1810}
1811
1812int
1813rte_vfio_setup_device(__rte_unused const char *sysfs_base,
1814 __rte_unused const char *dev_addr,
1815 __rte_unused int *vfio_dev_fd,
1816 __rte_unused struct vfio_device_info *device_info)
1817{
1818 return -1;
1819}
1820
1821int
1822rte_vfio_release_device(__rte_unused const char *sysfs_base,
1823 __rte_unused const char *dev_addr, __rte_unused int fd)
1824{
1825 return -1;
1826}
1827
1828int
1829rte_vfio_enable(__rte_unused const char *modname)
1830{
1831 return -1;
1832}
1833
1834int
1835rte_vfio_is_enabled(__rte_unused const char *modname)
1836{
1837 return -1;
1838}
1839
1840int
1841rte_vfio_noiommu_is_enabled(void)
1842{
1843 return -1;
1844}
1845
1846int
1847rte_vfio_clear_group(__rte_unused int vfio_group_fd)
1848{
1849 return -1;
1850}
1851
1852int
1853rte_vfio_get_group_num(__rte_unused const char *sysfs_base,
1854 __rte_unused const char *dev_addr,
1855 __rte_unused int *iommu_group_num)
1856{
1857 return -1;
1858}
1859
1860int
1861rte_vfio_get_container_fd(void)
1862{
1863 return -1;
1864}
1865
1866int
1867rte_vfio_get_group_fd(__rte_unused int iommu_group_num)
1868{
1869 return -1;
1870}
1871
1872int
1873rte_vfio_container_create(void)
1874{
1875 return -1;
1876}
1877
1878int
1879rte_vfio_container_destroy(__rte_unused int container_fd)
1880{
1881 return -1;
1882}
1883
1884int
1885rte_vfio_container_group_bind(__rte_unused int container_fd,
1886 __rte_unused int iommu_group_num)
1887{
1888 return -1;
1889}
1890
1891int
1892rte_vfio_container_group_unbind(__rte_unused int container_fd,
1893 __rte_unused int iommu_group_num)
1894{
1895 return -1;
1896}
1897
1898int
1899rte_vfio_container_dma_map(__rte_unused int container_fd,
1900 __rte_unused uint64_t vaddr,
1901 __rte_unused uint64_t iova,
1902 __rte_unused uint64_t len)
1903{
1904 return -1;
1905}
1906
1907int
1908rte_vfio_container_dma_unmap(__rte_unused int container_fd,
1909 __rte_unused uint64_t vaddr,
1910 __rte_unused uint64_t iova,
1911 __rte_unused uint64_t len)
1912{
1913 return -1;
1914}
1915
1916#endif /* VFIO_PRESENT */