]> git.proxmox.com Git - mirror_qemu.git/blob - subprojects/libvduse/libvduse.c
hw/ufs: Fix incorrect register fields
[mirror_qemu.git] / subprojects / libvduse / libvduse.c
1 /*
2 * VDUSE (vDPA Device in Userspace) library
3 *
4 * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
5 * Portions of codes and concepts borrowed from libvhost-user.c, so:
6 * Copyright IBM, Corp. 2007
7 * Copyright (c) 2016 Red Hat, Inc.
8 *
9 * Author:
10 * Xie Yongji <xieyongji@bytedance.com>
11 * Anthony Liguori <aliguori@us.ibm.com>
12 * Marc-André Lureau <mlureau@redhat.com>
13 * Victor Kaplansky <victork@redhat.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2 or
16 * later. See the COPYING file in the top-level directory.
17 */
18
19 #ifndef _GNU_SOURCE
20 #define _GNU_SOURCE
21 #endif
22
23 #include <stdlib.h>
24 #include <stdio.h>
25 #include <stdbool.h>
26 #include <stddef.h>
27 #include <errno.h>
28 #include <string.h>
29 #include <assert.h>
30 #include <endian.h>
31 #include <unistd.h>
32 #include <limits.h>
33 #include <fcntl.h>
34 #include <inttypes.h>
35
36 #include <sys/ioctl.h>
37 #include <sys/eventfd.h>
38 #include <sys/mman.h>
39
40 #include "include/atomic.h"
41 #include "linux-headers/linux/virtio_ring.h"
42 #include "linux-headers/linux/virtio_config.h"
43 #include "linux-headers/linux/vduse.h"
44 #include "libvduse.h"
45
/* Value passed as vq_align when the device is created */
#define VDUSE_VQ_ALIGN 4096
/* Number of slots in the per-device IOVA region table */
#define MAX_IOVA_REGIONS 256

/* The per-queue inflight log size is rounded up to a multiple of this */
#define LOG_ALIGNMENT 64

/* Largest multiple of m that does not exceed n */
#define ALIGN_DOWN(n, m) ((n) / (m) * (m))

/* Smallest multiple of m that is not below n */
#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))

#ifndef unlikely
/* Tell the compiler that x is expected to evaluate to false */
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif
60
/*
 * One entry of the inflight log, describing a single descriptor head of a
 * split virtqueue. The layout is stored in a memory-mapped file, so field
 * order and sizes must not change.
 */
typedef struct VduseDescStateSplit {
    /* 1 while the descriptor has been popped but not yet pushed back */
    uint8_t inflight;
    /* Explicit padding so that the following fields are naturally aligned */
    uint8_t padding[5];
    /* Not referenced by the code visible in this file */
    uint16_t next;
    /* Value of the queue's running counter when the descriptor was popped */
    uint64_t counter;
} VduseDescStateSplit;
67
68 typedef struct VduseVirtqLogInflight {
69 uint64_t features;
70 uint16_t version;
71 uint16_t desc_num;
72 uint16_t last_batch_head;
73 uint16_t used_idx;
74 VduseDescStateSplit desc[];
75 } VduseVirtqLogInflight;
76
77 typedef struct VduseVirtqLog {
78 VduseVirtqLogInflight inflight;
79 } VduseVirtqLog;
80
/* In-memory record of a descriptor that has to be resubmitted */
typedef struct VduseVirtqInflightDesc {
    /* Descriptor head index inside the inflight log */
    uint16_t index;
    /* Counter copied from the log entry, used to order resubmission */
    uint64_t counter;
} VduseVirtqInflightDesc;
85
/* Location and size of a split virtqueue ring */
typedef struct VduseRing {
    /* Queue size as reported by the kernel */
    unsigned int num;
    /* IOVAs of the three ring areas */
    uint64_t desc_addr;
    uint64_t avail_addr;
    uint64_t used_addr;
    /* The same areas translated to local virtual addresses */
    struct vring_desc *desc;
    struct vring_avail *avail;
    struct vring_used *used;
} VduseRing;
95
96 struct VduseVirtq {
97 VduseRing vring;
98 uint16_t last_avail_idx;
99 uint16_t shadow_avail_idx;
100 uint16_t used_idx;
101 uint16_t signalled_used;
102 bool signalled_used_valid;
103 int index;
104 unsigned int inuse;
105 bool ready;
106 int fd;
107 VduseDev *dev;
108 VduseVirtqInflightDesc *resubmit_list;
109 uint16_t resubmit_num;
110 uint64_t counter;
111 VduseVirtqLog *log;
112 };
113
/* One cached IOVA range together with its local mapping */
typedef struct VduseIovaRegion {
    /* First IOVA covered by the region */
    uint64_t iova;
    /* Length of the region in bytes */
    uint64_t size;
    /* Offset of the region's data inside the mapping */
    uint64_t mmap_offset;
    /* Base address of the mapping; 0 marks an unused slot */
    uint64_t mmap_addr;
} VduseIovaRegion;
120
121 struct VduseDev {
122 VduseVirtq *vqs;
123 VduseIovaRegion regions[MAX_IOVA_REGIONS];
124 int num_regions;
125 char *name;
126 uint32_t device_id;
127 uint32_t vendor_id;
128 uint16_t num_queues;
129 uint16_t queue_size;
130 uint64_t features;
131 const VduseOps *ops;
132 int fd;
133 int ctrl_fd;
134 void *priv;
135 void *log;
136 };
137
138 static inline size_t vduse_vq_log_size(uint16_t queue_size)
139 {
140 return ALIGN_UP(sizeof(VduseDescStateSplit) * queue_size +
141 sizeof(VduseVirtqLogInflight), LOG_ALIGNMENT);
142 }
143
/*
 * Open (creating it if needed) the file at filename, resize it to size bytes
 * and map it shared and read/write.
 *
 * Returns the mapping, or MAP_FAILED if opening, resizing or mapping fails.
 * The file descriptor is closed before returning in every case.
 */
static void *vduse_log_get(const char *filename, size_t size)
{
    void *map = MAP_FAILED;
    int fd = open(filename, O_RDWR | O_CREAT, 0600);

    if (fd == -1) {
        return MAP_FAILED;
    }

    if (ftruncate(fd, size) != -1) {
        map = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    }

    close(fd);
    return map;
}
164
/* Return true if bit number fbit (0..63) is set in features */
static inline bool has_feature(uint64_t features, unsigned int fbit)
{
    assert(fbit < 64);

    return (features >> fbit) & 1;
}
170
171 static inline bool vduse_dev_has_feature(VduseDev *dev, unsigned int fbit)
172 {
173 return has_feature(dev->features, fbit);
174 }
175
176 uint64_t vduse_get_virtio_features(void)
177 {
178 return (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
179 (1ULL << VIRTIO_F_VERSION_1) |
180 (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
181 (1ULL << VIRTIO_RING_F_EVENT_IDX) |
182 (1ULL << VIRTIO_RING_F_INDIRECT_DESC);
183 }
184
185 VduseDev *vduse_queue_get_dev(VduseVirtq *vq)
186 {
187 return vq->dev;
188 }
189
190 int vduse_queue_get_fd(VduseVirtq *vq)
191 {
192 return vq->fd;
193 }
194
195 void *vduse_dev_get_priv(VduseDev *dev)
196 {
197 return dev->priv;
198 }
199
200 VduseVirtq *vduse_dev_get_queue(VduseDev *dev, int index)
201 {
202 return &dev->vqs[index];
203 }
204
205 int vduse_dev_get_fd(VduseDev *dev)
206 {
207 return dev->fd;
208 }
209
210 static int vduse_inject_irq(VduseDev *dev, int index)
211 {
212 return ioctl(dev->fd, VDUSE_VQ_INJECT_IRQ, &index);
213 }
214
215 static int inflight_desc_compare(const void *a, const void *b)
216 {
217 VduseVirtqInflightDesc *desc0 = (VduseVirtqInflightDesc *)a,
218 *desc1 = (VduseVirtqInflightDesc *)b;
219
220 if (desc1->counter > desc0->counter &&
221 (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2) {
222 return 1;
223 }
224
225 return -1;
226 }
227
228 static int vduse_queue_check_inflights(VduseVirtq *vq)
229 {
230 int i = 0;
231 VduseDev *dev = vq->dev;
232
233 vq->used_idx = le16toh(vq->vring.used->idx);
234 vq->resubmit_num = 0;
235 vq->resubmit_list = NULL;
236 vq->counter = 0;
237
238 if (unlikely(vq->log->inflight.used_idx != vq->used_idx)) {
239 if (vq->log->inflight.last_batch_head > VIRTQUEUE_MAX_SIZE) {
240 return -1;
241 }
242
243 vq->log->inflight.desc[vq->log->inflight.last_batch_head].inflight = 0;
244
245 barrier();
246
247 vq->log->inflight.used_idx = vq->used_idx;
248 }
249
250 for (i = 0; i < vq->log->inflight.desc_num; i++) {
251 if (vq->log->inflight.desc[i].inflight == 1) {
252 vq->inuse++;
253 }
254 }
255
256 vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx;
257
258 if (vq->inuse) {
259 vq->resubmit_list = calloc(vq->inuse, sizeof(VduseVirtqInflightDesc));
260 if (!vq->resubmit_list) {
261 return -1;
262 }
263
264 for (i = 0; i < vq->log->inflight.desc_num; i++) {
265 if (vq->log->inflight.desc[i].inflight) {
266 vq->resubmit_list[vq->resubmit_num].index = i;
267 vq->resubmit_list[vq->resubmit_num].counter =
268 vq->log->inflight.desc[i].counter;
269 vq->resubmit_num++;
270 }
271 }
272
273 if (vq->resubmit_num > 1) {
274 qsort(vq->resubmit_list, vq->resubmit_num,
275 sizeof(VduseVirtqInflightDesc), inflight_desc_compare);
276 }
277 vq->counter = vq->resubmit_list[0].counter + 1;
278 }
279
280 vduse_inject_irq(dev, vq->index);
281
282 return 0;
283 }
284
285 static int vduse_queue_inflight_get(VduseVirtq *vq, int desc_idx)
286 {
287 vq->log->inflight.desc[desc_idx].counter = vq->counter++;
288
289 barrier();
290
291 vq->log->inflight.desc[desc_idx].inflight = 1;
292
293 return 0;
294 }
295
296 static int vduse_queue_inflight_pre_put(VduseVirtq *vq, int desc_idx)
297 {
298 vq->log->inflight.last_batch_head = desc_idx;
299
300 return 0;
301 }
302
303 static int vduse_queue_inflight_post_put(VduseVirtq *vq, int desc_idx)
304 {
305 vq->log->inflight.desc[desc_idx].inflight = 0;
306
307 barrier();
308
309 vq->log->inflight.used_idx = vq->used_idx;
310
311 return 0;
312 }
313
314 static void vduse_iova_remove_region(VduseDev *dev, uint64_t start,
315 uint64_t last)
316 {
317 int i;
318
319 if (last == start) {
320 return;
321 }
322
323 for (i = 0; i < MAX_IOVA_REGIONS; i++) {
324 if (!dev->regions[i].mmap_addr) {
325 continue;
326 }
327
328 if (start <= dev->regions[i].iova &&
329 last >= (dev->regions[i].iova + dev->regions[i].size - 1)) {
330 munmap((void *)(uintptr_t)dev->regions[i].mmap_addr,
331 dev->regions[i].mmap_offset + dev->regions[i].size);
332 dev->regions[i].mmap_addr = 0;
333 dev->num_regions--;
334 }
335 }
336 }
337
338 static int vduse_iova_add_region(VduseDev *dev, int fd,
339 uint64_t offset, uint64_t start,
340 uint64_t last, int prot)
341 {
342 int i;
343 uint64_t size = last - start + 1;
344 void *mmap_addr = mmap(0, size + offset, prot, MAP_SHARED, fd, 0);
345
346 if (mmap_addr == MAP_FAILED) {
347 close(fd);
348 return -EINVAL;
349 }
350
351 for (i = 0; i < MAX_IOVA_REGIONS; i++) {
352 if (!dev->regions[i].mmap_addr) {
353 dev->regions[i].mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
354 dev->regions[i].mmap_offset = offset;
355 dev->regions[i].iova = start;
356 dev->regions[i].size = size;
357 dev->num_regions++;
358 break;
359 }
360 }
361 assert(i < MAX_IOVA_REGIONS);
362 close(fd);
363
364 return 0;
365 }
366
367 static int perm_to_prot(uint8_t perm)
368 {
369 int prot = 0;
370
371 switch (perm) {
372 case VDUSE_ACCESS_WO:
373 prot |= PROT_WRITE;
374 break;
375 case VDUSE_ACCESS_RO:
376 prot |= PROT_READ;
377 break;
378 case VDUSE_ACCESS_RW:
379 prot |= PROT_READ | PROT_WRITE;
380 break;
381 default:
382 break;
383 }
384
385 return prot;
386 }
387
388 static inline void *iova_to_va(VduseDev *dev, uint64_t *plen, uint64_t iova)
389 {
390 int i, ret;
391 struct vduse_iotlb_entry entry;
392
393 for (i = 0; i < MAX_IOVA_REGIONS; i++) {
394 VduseIovaRegion *r = &dev->regions[i];
395
396 if (!r->mmap_addr) {
397 continue;
398 }
399
400 if ((iova >= r->iova) && (iova < (r->iova + r->size))) {
401 if ((iova + *plen) > (r->iova + r->size)) {
402 *plen = r->iova + r->size - iova;
403 }
404 return (void *)(uintptr_t)(iova - r->iova +
405 r->mmap_addr + r->mmap_offset);
406 }
407 }
408
409 entry.start = iova;
410 entry.last = iova + 1;
411 ret = ioctl(dev->fd, VDUSE_IOTLB_GET_FD, &entry);
412 if (ret < 0) {
413 return NULL;
414 }
415
416 if (!vduse_iova_add_region(dev, ret, entry.offset, entry.start,
417 entry.last, perm_to_prot(entry.perm))) {
418 return iova_to_va(dev, plen, iova);
419 }
420
421 return NULL;
422 }
423
424 static inline uint16_t vring_avail_flags(VduseVirtq *vq)
425 {
426 return le16toh(vq->vring.avail->flags);
427 }
428
429 static inline uint16_t vring_avail_idx(VduseVirtq *vq)
430 {
431 vq->shadow_avail_idx = le16toh(vq->vring.avail->idx);
432
433 return vq->shadow_avail_idx;
434 }
435
436 static inline uint16_t vring_avail_ring(VduseVirtq *vq, int i)
437 {
438 return le16toh(vq->vring.avail->ring[i]);
439 }
440
441 static inline uint16_t vring_get_used_event(VduseVirtq *vq)
442 {
443 return vring_avail_ring(vq, vq->vring.num);
444 }
445
446 static bool vduse_queue_get_head(VduseVirtq *vq, unsigned int idx,
447 unsigned int *head)
448 {
449 /*
450 * Grab the next descriptor number they're advertising, and increment
451 * the index we've seen.
452 */
453 *head = vring_avail_ring(vq, idx % vq->vring.num);
454
455 /* If their number is silly, that's a fatal mistake. */
456 if (*head >= vq->vring.num) {
457 fprintf(stderr, "Guest says index %u is available\n", *head);
458 return false;
459 }
460
461 return true;
462 }
463
464 static int
465 vduse_queue_read_indirect_desc(VduseDev *dev, struct vring_desc *desc,
466 uint64_t addr, size_t len)
467 {
468 struct vring_desc *ori_desc;
469 uint64_t read_len;
470
471 if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) {
472 return -1;
473 }
474
475 if (len == 0) {
476 return -1;
477 }
478
479 while (len) {
480 read_len = len;
481 ori_desc = iova_to_va(dev, &read_len, addr);
482 if (!ori_desc) {
483 return -1;
484 }
485
486 memcpy(desc, ori_desc, read_len);
487 len -= read_len;
488 addr += read_len;
489 desc += read_len;
490 }
491
492 return 0;
493 }
494
/* Results of vduse_queue_read_next_desc() */
enum {
    VIRTQUEUE_READ_DESC_ERROR = -1, /* chain points outside the table */
    VIRTQUEUE_READ_DESC_DONE = 0,   /* end of chain */
    VIRTQUEUE_READ_DESC_MORE = 1,   /* more buffers in chain */
};
500
501 static int vduse_queue_read_next_desc(struct vring_desc *desc, int i,
502 unsigned int max, unsigned int *next)
503 {
504 /* If this descriptor says it doesn't chain, we're done. */
505 if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) {
506 return VIRTQUEUE_READ_DESC_DONE;
507 }
508
509 /* Check they're not leading us off end of descriptors. */
510 *next = desc[i].next;
511 /* Make sure compiler knows to grab that: we don't want it changing! */
512 smp_wmb();
513
514 if (*next >= max) {
515 fprintf(stderr, "Desc next is %u\n", *next);
516 return VIRTQUEUE_READ_DESC_ERROR;
517 }
518
519 return VIRTQUEUE_READ_DESC_MORE;
520 }
521
522 /*
523 * Fetch avail_idx from VQ memory only when we really need to know if
524 * guest has added some buffers.
525 */
526 static bool vduse_queue_empty(VduseVirtq *vq)
527 {
528 if (unlikely(!vq->vring.avail)) {
529 return true;
530 }
531
532 if (vq->shadow_avail_idx != vq->last_avail_idx) {
533 return false;
534 }
535
536 return vring_avail_idx(vq) == vq->last_avail_idx;
537 }
538
539 static bool vduse_queue_should_notify(VduseVirtq *vq)
540 {
541 VduseDev *dev = vq->dev;
542 uint16_t old, new;
543 bool v;
544
545 /* We need to expose used array entries before checking used event. */
546 smp_mb();
547
548 /* Always notify when queue is empty (when feature acknowledge) */
549 if (vduse_dev_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
550 !vq->inuse && vduse_queue_empty(vq)) {
551 return true;
552 }
553
554 if (!vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
555 return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
556 }
557
558 v = vq->signalled_used_valid;
559 vq->signalled_used_valid = true;
560 old = vq->signalled_used;
561 new = vq->signalled_used = vq->used_idx;
562 return !v || vring_need_event(vring_get_used_event(vq), new, old);
563 }
564
565 void vduse_queue_notify(VduseVirtq *vq)
566 {
567 VduseDev *dev = vq->dev;
568
569 if (unlikely(!vq->vring.avail)) {
570 return;
571 }
572
573 if (!vduse_queue_should_notify(vq)) {
574 return;
575 }
576
577 if (vduse_inject_irq(dev, vq->index) < 0) {
578 fprintf(stderr, "Error inject irq for vq %d: %s\n",
579 vq->index, strerror(errno));
580 }
581 }
582
583 static inline void vring_set_avail_event(VduseVirtq *vq, uint16_t val)
584 {
585 uint16_t val_le = htole16(val);
586 memcpy(&vq->vring.used->ring[vq->vring.num], &val_le, sizeof(uint16_t));
587 }
588
589 static bool vduse_queue_map_single_desc(VduseVirtq *vq, unsigned int *p_num_sg,
590 struct iovec *iov, unsigned int max_num_sg,
591 bool is_write, uint64_t pa, size_t sz)
592 {
593 unsigned num_sg = *p_num_sg;
594 VduseDev *dev = vq->dev;
595
596 assert(num_sg <= max_num_sg);
597
598 if (!sz) {
599 fprintf(stderr, "virtio: zero sized buffers are not allowed\n");
600 return false;
601 }
602
603 while (sz) {
604 uint64_t len = sz;
605
606 if (num_sg == max_num_sg) {
607 fprintf(stderr,
608 "virtio: too many descriptors in indirect table\n");
609 return false;
610 }
611
612 iov[num_sg].iov_base = iova_to_va(dev, &len, pa);
613 if (iov[num_sg].iov_base == NULL) {
614 fprintf(stderr, "virtio: invalid address for buffers\n");
615 return false;
616 }
617 iov[num_sg++].iov_len = len;
618 sz -= len;
619 pa += len;
620 }
621
622 *p_num_sg = num_sg;
623 return true;
624 }
625
626 static void *vduse_queue_alloc_element(size_t sz, unsigned out_num,
627 unsigned in_num)
628 {
629 VduseVirtqElement *elem;
630 size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
631 size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
632 size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);
633
634 assert(sz >= sizeof(VduseVirtqElement));
635 elem = malloc(out_sg_end);
636 if (!elem) {
637 return NULL;
638 }
639 elem->out_num = out_num;
640 elem->in_num = in_num;
641 elem->in_sg = (void *)elem + in_sg_ofs;
642 elem->out_sg = (void *)elem + out_sg_ofs;
643 return elem;
644 }
645
646 static void *vduse_queue_map_desc(VduseVirtq *vq, unsigned int idx, size_t sz)
647 {
648 struct vring_desc *desc = vq->vring.desc;
649 VduseDev *dev = vq->dev;
650 uint64_t desc_addr, read_len;
651 unsigned int desc_len;
652 unsigned int max = vq->vring.num;
653 unsigned int i = idx;
654 VduseVirtqElement *elem;
655 struct iovec iov[VIRTQUEUE_MAX_SIZE];
656 struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
657 unsigned int out_num = 0, in_num = 0;
658 int rc;
659
660 if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
661 if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
662 fprintf(stderr, "Invalid size for indirect buffer table\n");
663 return NULL;
664 }
665
666 /* loop over the indirect descriptor table */
667 desc_addr = le64toh(desc[i].addr);
668 desc_len = le32toh(desc[i].len);
669 max = desc_len / sizeof(struct vring_desc);
670 read_len = desc_len;
671 desc = iova_to_va(dev, &read_len, desc_addr);
672 if (unlikely(desc && read_len != desc_len)) {
673 /* Failed to use zero copy */
674 desc = NULL;
675 if (!vduse_queue_read_indirect_desc(dev, desc_buf,
676 desc_addr,
677 desc_len)) {
678 desc = desc_buf;
679 }
680 }
681 if (!desc) {
682 fprintf(stderr, "Invalid indirect buffer table\n");
683 return NULL;
684 }
685 i = 0;
686 }
687
688 /* Collect all the descriptors */
689 do {
690 if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
691 if (!vduse_queue_map_single_desc(vq, &in_num, iov + out_num,
692 VIRTQUEUE_MAX_SIZE - out_num,
693 true, le64toh(desc[i].addr),
694 le32toh(desc[i].len))) {
695 return NULL;
696 }
697 } else {
698 if (in_num) {
699 fprintf(stderr, "Incorrect order for descriptors\n");
700 return NULL;
701 }
702 if (!vduse_queue_map_single_desc(vq, &out_num, iov,
703 VIRTQUEUE_MAX_SIZE, false,
704 le64toh(desc[i].addr),
705 le32toh(desc[i].len))) {
706 return NULL;
707 }
708 }
709
710 /* If we've got too many, that implies a descriptor loop. */
711 if ((in_num + out_num) > max) {
712 fprintf(stderr, "Looped descriptor\n");
713 return NULL;
714 }
715 rc = vduse_queue_read_next_desc(desc, i, max, &i);
716 } while (rc == VIRTQUEUE_READ_DESC_MORE);
717
718 if (rc == VIRTQUEUE_READ_DESC_ERROR) {
719 fprintf(stderr, "read descriptor error\n");
720 return NULL;
721 }
722
723 /* Now copy what we have collected and mapped */
724 elem = vduse_queue_alloc_element(sz, out_num, in_num);
725 if (!elem) {
726 fprintf(stderr, "read descriptor error\n");
727 return NULL;
728 }
729 elem->index = idx;
730 for (i = 0; i < out_num; i++) {
731 elem->out_sg[i] = iov[i];
732 }
733 for (i = 0; i < in_num; i++) {
734 elem->in_sg[i] = iov[out_num + i];
735 }
736
737 return elem;
738 }
739
740 void *vduse_queue_pop(VduseVirtq *vq, size_t sz)
741 {
742 unsigned int head;
743 VduseVirtqElement *elem;
744 VduseDev *dev = vq->dev;
745 int i;
746
747 if (unlikely(!vq->vring.avail)) {
748 return NULL;
749 }
750
751 if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) {
752 i = (--vq->resubmit_num);
753 elem = vduse_queue_map_desc(vq, vq->resubmit_list[i].index, sz);
754
755 if (!vq->resubmit_num) {
756 free(vq->resubmit_list);
757 vq->resubmit_list = NULL;
758 }
759
760 return elem;
761 }
762
763 if (vduse_queue_empty(vq)) {
764 return NULL;
765 }
766 /* Needed after virtio_queue_empty() */
767 smp_rmb();
768
769 if (vq->inuse >= vq->vring.num) {
770 fprintf(stderr, "Virtqueue size exceeded: %d\n", vq->inuse);
771 return NULL;
772 }
773
774 if (!vduse_queue_get_head(vq, vq->last_avail_idx++, &head)) {
775 return NULL;
776 }
777
778 if (vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
779 vring_set_avail_event(vq, vq->last_avail_idx);
780 }
781
782 elem = vduse_queue_map_desc(vq, head, sz);
783
784 if (!elem) {
785 return NULL;
786 }
787
788 vq->inuse++;
789
790 vduse_queue_inflight_get(vq, head);
791
792 return elem;
793 }
794
795 static inline void vring_used_write(VduseVirtq *vq,
796 struct vring_used_elem *uelem, int i)
797 {
798 struct vring_used *used = vq->vring.used;
799
800 used->ring[i] = *uelem;
801 }
802
803 static void vduse_queue_fill(VduseVirtq *vq, const VduseVirtqElement *elem,
804 unsigned int len, unsigned int idx)
805 {
806 struct vring_used_elem uelem;
807
808 if (unlikely(!vq->vring.used)) {
809 return;
810 }
811
812 idx = (idx + vq->used_idx) % vq->vring.num;
813
814 uelem.id = htole32(elem->index);
815 uelem.len = htole32(len);
816 vring_used_write(vq, &uelem, idx);
817 }
818
819 static inline void vring_used_idx_set(VduseVirtq *vq, uint16_t val)
820 {
821 vq->vring.used->idx = htole16(val);
822 vq->used_idx = val;
823 }
824
825 static void vduse_queue_flush(VduseVirtq *vq, unsigned int count)
826 {
827 uint16_t old, new;
828
829 if (unlikely(!vq->vring.used)) {
830 return;
831 }
832
833 /* Make sure buffer is written before we update index. */
834 smp_wmb();
835
836 old = vq->used_idx;
837 new = old + count;
838 vring_used_idx_set(vq, new);
839 vq->inuse -= count;
840 if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
841 vq->signalled_used_valid = false;
842 }
843 }
844
845 void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem,
846 unsigned int len)
847 {
848 vduse_queue_fill(vq, elem, len, 0);
849 vduse_queue_inflight_pre_put(vq, elem->index);
850 vduse_queue_flush(vq, 1);
851 vduse_queue_inflight_post_put(vq, elem->index);
852 }
853
854 static int vduse_queue_update_vring(VduseVirtq *vq, uint64_t desc_addr,
855 uint64_t avail_addr, uint64_t used_addr)
856 {
857 struct VduseDev *dev = vq->dev;
858 uint64_t len;
859
860 len = sizeof(struct vring_desc);
861 vq->vring.desc = iova_to_va(dev, &len, desc_addr);
862 if (len != sizeof(struct vring_desc)) {
863 return -EINVAL;
864 }
865
866 len = sizeof(struct vring_avail);
867 vq->vring.avail = iova_to_va(dev, &len, avail_addr);
868 if (len != sizeof(struct vring_avail)) {
869 return -EINVAL;
870 }
871
872 len = sizeof(struct vring_used);
873 vq->vring.used = iova_to_va(dev, &len, used_addr);
874 if (len != sizeof(struct vring_used)) {
875 return -EINVAL;
876 }
877
878 if (!vq->vring.desc || !vq->vring.avail || !vq->vring.used) {
879 fprintf(stderr, "Failed to get vq[%d] iova mapping\n", vq->index);
880 return -EINVAL;
881 }
882
883 return 0;
884 }
885
886 static void vduse_queue_enable(VduseVirtq *vq)
887 {
888 struct VduseDev *dev = vq->dev;
889 struct vduse_vq_info vq_info;
890 struct vduse_vq_eventfd vq_eventfd;
891 int fd;
892
893 vq_info.index = vq->index;
894 if (ioctl(dev->fd, VDUSE_VQ_GET_INFO, &vq_info)) {
895 fprintf(stderr, "Failed to get vq[%d] info: %s\n",
896 vq->index, strerror(errno));
897 return;
898 }
899
900 if (!vq_info.ready) {
901 return;
902 }
903
904 vq->vring.num = vq_info.num;
905 vq->vring.desc_addr = vq_info.desc_addr;
906 vq->vring.avail_addr = vq_info.driver_addr;
907 vq->vring.used_addr = vq_info.device_addr;
908
909 if (vduse_queue_update_vring(vq, vq_info.desc_addr,
910 vq_info.driver_addr, vq_info.device_addr)) {
911 fprintf(stderr, "Failed to update vring for vq[%d]\n", vq->index);
912 return;
913 }
914
915 fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
916 if (fd < 0) {
917 fprintf(stderr, "Failed to init eventfd for vq[%d]\n", vq->index);
918 return;
919 }
920
921 vq_eventfd.index = vq->index;
922 vq_eventfd.fd = fd;
923 if (ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &vq_eventfd)) {
924 fprintf(stderr, "Failed to setup kick fd for vq[%d]\n", vq->index);
925 close(fd);
926 return;
927 }
928
929 vq->fd = fd;
930 vq->signalled_used_valid = false;
931 vq->ready = true;
932
933 if (vduse_queue_check_inflights(vq)) {
934 fprintf(stderr, "Failed to check inflights for vq[%d]\n", vq->index);
935 close(fd);
936 return;
937 }
938
939 dev->ops->enable_queue(dev, vq);
940 }
941
942 static void vduse_queue_disable(VduseVirtq *vq)
943 {
944 struct VduseDev *dev = vq->dev;
945 struct vduse_vq_eventfd eventfd;
946
947 if (!vq->ready) {
948 return;
949 }
950
951 dev->ops->disable_queue(dev, vq);
952
953 eventfd.index = vq->index;
954 eventfd.fd = VDUSE_EVENTFD_DEASSIGN;
955 ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &eventfd);
956 close(vq->fd);
957
958 assert(vq->inuse == 0);
959
960 vq->vring.num = 0;
961 vq->vring.desc_addr = 0;
962 vq->vring.avail_addr = 0;
963 vq->vring.used_addr = 0;
964 vq->vring.desc = 0;
965 vq->vring.avail = 0;
966 vq->vring.used = 0;
967 vq->ready = false;
968 vq->fd = -1;
969 }
970
971 static void vduse_dev_start_dataplane(VduseDev *dev)
972 {
973 int i;
974
975 if (ioctl(dev->fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
976 fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
977 return;
978 }
979 assert(vduse_dev_has_feature(dev, VIRTIO_F_VERSION_1));
980
981 for (i = 0; i < dev->num_queues; i++) {
982 vduse_queue_enable(&dev->vqs[i]);
983 }
984 }
985
986 static void vduse_dev_stop_dataplane(VduseDev *dev)
987 {
988 size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE);
989 int i;
990
991 for (i = 0; i < dev->num_queues; i++) {
992 vduse_queue_disable(&dev->vqs[i]);
993 }
994 if (dev->log) {
995 memset(dev->log, 0, log_size);
996 }
997 dev->features = 0;
998 vduse_iova_remove_region(dev, 0, ULONG_MAX);
999 }
1000
1001 int vduse_dev_handler(VduseDev *dev)
1002 {
1003 struct vduse_dev_request req;
1004 struct vduse_dev_response resp = { 0 };
1005 VduseVirtq *vq;
1006 int i, ret;
1007
1008 ret = read(dev->fd, &req, sizeof(req));
1009 if (ret != sizeof(req)) {
1010 fprintf(stderr, "Read request error [%d]: %s\n",
1011 ret, strerror(errno));
1012 return -errno;
1013 }
1014 resp.request_id = req.request_id;
1015
1016 switch (req.type) {
1017 case VDUSE_GET_VQ_STATE:
1018 vq = &dev->vqs[req.vq_state.index];
1019 resp.vq_state.split.avail_index = vq->last_avail_idx;
1020 resp.result = VDUSE_REQ_RESULT_OK;
1021 break;
1022 case VDUSE_SET_STATUS:
1023 if (req.s.status & VIRTIO_CONFIG_S_DRIVER_OK) {
1024 vduse_dev_start_dataplane(dev);
1025 } else if (req.s.status == 0) {
1026 vduse_dev_stop_dataplane(dev);
1027 }
1028 resp.result = VDUSE_REQ_RESULT_OK;
1029 break;
1030 case VDUSE_UPDATE_IOTLB:
1031 /* The iova will be updated by iova_to_va() later, so just remove it */
1032 vduse_iova_remove_region(dev, req.iova.start, req.iova.last);
1033 for (i = 0; i < dev->num_queues; i++) {
1034 VduseVirtq *vq = &dev->vqs[i];
1035 if (vq->ready) {
1036 if (vduse_queue_update_vring(vq, vq->vring.desc_addr,
1037 vq->vring.avail_addr,
1038 vq->vring.used_addr)) {
1039 fprintf(stderr, "Failed to update vring for vq[%d]\n",
1040 vq->index);
1041 }
1042 }
1043 }
1044 resp.result = VDUSE_REQ_RESULT_OK;
1045 break;
1046 default:
1047 resp.result = VDUSE_REQ_RESULT_FAILED;
1048 break;
1049 }
1050
1051 ret = write(dev->fd, &resp, sizeof(resp));
1052 if (ret != sizeof(resp)) {
1053 fprintf(stderr, "Write request %d error [%d]: %s\n",
1054 req.type, ret, strerror(errno));
1055 return -errno;
1056 }
1057 return 0;
1058 }
1059
1060 int vduse_dev_update_config(VduseDev *dev, uint32_t size,
1061 uint32_t offset, char *buffer)
1062 {
1063 int ret;
1064 struct vduse_config_data *data;
1065
1066 data = malloc(offsetof(struct vduse_config_data, buffer) + size);
1067 if (!data) {
1068 return -ENOMEM;
1069 }
1070
1071 data->offset = offset;
1072 data->length = size;
1073 memcpy(data->buffer, buffer, size);
1074
1075 ret = ioctl(dev->fd, VDUSE_DEV_SET_CONFIG, data);
1076 free(data);
1077
1078 if (ret) {
1079 return -errno;
1080 }
1081
1082 if (ioctl(dev->fd, VDUSE_DEV_INJECT_CONFIG_IRQ)) {
1083 return -errno;
1084 }
1085
1086 return 0;
1087 }
1088
1089 int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size)
1090 {
1091 VduseVirtq *vq = &dev->vqs[index];
1092 struct vduse_vq_config vq_config = { 0 };
1093
1094 if (max_size > VIRTQUEUE_MAX_SIZE) {
1095 return -EINVAL;
1096 }
1097
1098 vq_config.index = vq->index;
1099 vq_config.max_size = max_size;
1100
1101 if (ioctl(dev->fd, VDUSE_VQ_SETUP, &vq_config)) {
1102 return -errno;
1103 }
1104
1105 vduse_queue_enable(vq);
1106
1107 return 0;
1108 }
1109
1110 int vduse_set_reconnect_log_file(VduseDev *dev, const char *filename)
1111 {
1112
1113 size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE);
1114 void *log;
1115 int i;
1116
1117 dev->log = log = vduse_log_get(filename, log_size);
1118 if (log == MAP_FAILED) {
1119 fprintf(stderr, "Failed to get vduse log\n");
1120 return -EINVAL;
1121 }
1122
1123 for (i = 0; i < dev->num_queues; i++) {
1124 dev->vqs[i].log = log;
1125 dev->vqs[i].log->inflight.desc_num = VIRTQUEUE_MAX_SIZE;
1126 log = (void *)((char *)log + vduse_vq_log_size(VIRTQUEUE_MAX_SIZE));
1127 }
1128
1129 return 0;
1130 }
1131
1132 static int vduse_dev_init_vqs(VduseDev *dev, uint16_t num_queues)
1133 {
1134 VduseVirtq *vqs;
1135 int i;
1136
1137 vqs = calloc(sizeof(VduseVirtq), num_queues);
1138 if (!vqs) {
1139 return -ENOMEM;
1140 }
1141
1142 for (i = 0; i < num_queues; i++) {
1143 vqs[i].index = i;
1144 vqs[i].dev = dev;
1145 vqs[i].fd = -1;
1146 }
1147 dev->vqs = vqs;
1148
1149 return 0;
1150 }
1151
1152 static int vduse_dev_init(VduseDev *dev, const char *name,
1153 uint16_t num_queues, const VduseOps *ops,
1154 void *priv)
1155 {
1156 char *dev_path, *dev_name;
1157 int ret, fd;
1158
1159 dev_path = malloc(strlen(name) + strlen("/dev/vduse/") + 1);
1160 if (!dev_path) {
1161 return -ENOMEM;
1162 }
1163 sprintf(dev_path, "/dev/vduse/%s", name);
1164
1165 fd = open(dev_path, O_RDWR);
1166 free(dev_path);
1167 if (fd < 0) {
1168 fprintf(stderr, "Failed to open vduse dev %s: %s\n",
1169 name, strerror(errno));
1170 return -errno;
1171 }
1172
1173 if (ioctl(fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
1174 fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
1175 close(fd);
1176 return -errno;
1177 }
1178
1179 dev_name = strdup(name);
1180 if (!dev_name) {
1181 close(fd);
1182 return -ENOMEM;
1183 }
1184
1185 ret = vduse_dev_init_vqs(dev, num_queues);
1186 if (ret) {
1187 free(dev_name);
1188 close(fd);
1189 return ret;
1190 }
1191
1192 dev->name = dev_name;
1193 dev->num_queues = num_queues;
1194 dev->fd = fd;
1195 dev->ops = ops;
1196 dev->priv = priv;
1197
1198 return 0;
1199 }
1200
1201 static inline bool vduse_name_is_invalid(const char *name)
1202 {
1203 return strlen(name) >= VDUSE_NAME_MAX || strstr(name, "..");
1204 }
1205
1206 VduseDev *vduse_dev_create_by_fd(int fd, uint16_t num_queues,
1207 const VduseOps *ops, void *priv)
1208 {
1209 VduseDev *dev;
1210 int ret;
1211
1212 if (!ops || !ops->enable_queue || !ops->disable_queue) {
1213 fprintf(stderr, "Invalid parameter for vduse\n");
1214 return NULL;
1215 }
1216
1217 dev = calloc(sizeof(VduseDev), 1);
1218 if (!dev) {
1219 fprintf(stderr, "Failed to allocate vduse device\n");
1220 return NULL;
1221 }
1222
1223 if (ioctl(fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
1224 fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
1225 free(dev);
1226 return NULL;
1227 }
1228
1229 ret = vduse_dev_init_vqs(dev, num_queues);
1230 if (ret) {
1231 fprintf(stderr, "Failed to init vqs\n");
1232 free(dev);
1233 return NULL;
1234 }
1235
1236 dev->num_queues = num_queues;
1237 dev->fd = fd;
1238 dev->ops = ops;
1239 dev->priv = priv;
1240
1241 return dev;
1242 }
1243
1244 VduseDev *vduse_dev_create_by_name(const char *name, uint16_t num_queues,
1245 const VduseOps *ops, void *priv)
1246 {
1247 VduseDev *dev;
1248 int ret;
1249
1250 if (!name || vduse_name_is_invalid(name) || !ops ||
1251 !ops->enable_queue || !ops->disable_queue) {
1252 fprintf(stderr, "Invalid parameter for vduse\n");
1253 return NULL;
1254 }
1255
1256 dev = calloc(sizeof(VduseDev), 1);
1257 if (!dev) {
1258 fprintf(stderr, "Failed to allocate vduse device\n");
1259 return NULL;
1260 }
1261
1262 ret = vduse_dev_init(dev, name, num_queues, ops, priv);
1263 if (ret < 0) {
1264 fprintf(stderr, "Failed to init vduse device %s: %s\n",
1265 name, strerror(-ret));
1266 free(dev);
1267 return NULL;
1268 }
1269
1270 return dev;
1271 }
1272
1273 VduseDev *vduse_dev_create(const char *name, uint32_t device_id,
1274 uint32_t vendor_id, uint64_t features,
1275 uint16_t num_queues, uint32_t config_size,
1276 char *config, const VduseOps *ops, void *priv)
1277 {
1278 VduseDev *dev;
1279 int ret, ctrl_fd;
1280 uint64_t version;
1281 struct vduse_dev_config *dev_config;
1282 size_t size = offsetof(struct vduse_dev_config, config);
1283
1284 if (!name || vduse_name_is_invalid(name) ||
1285 !has_feature(features, VIRTIO_F_VERSION_1) || !config ||
1286 !config_size || !ops || !ops->enable_queue || !ops->disable_queue) {
1287 fprintf(stderr, "Invalid parameter for vduse\n");
1288 return NULL;
1289 }
1290
1291 dev = calloc(sizeof(VduseDev), 1);
1292 if (!dev) {
1293 fprintf(stderr, "Failed to allocate vduse device\n");
1294 return NULL;
1295 }
1296
1297 ctrl_fd = open("/dev/vduse/control", O_RDWR);
1298 if (ctrl_fd < 0) {
1299 fprintf(stderr, "Failed to open /dev/vduse/control: %s\n",
1300 strerror(errno));
1301 goto err_ctrl;
1302 }
1303
1304 version = VDUSE_API_VERSION;
1305 if (ioctl(ctrl_fd, VDUSE_SET_API_VERSION, &version)) {
1306 fprintf(stderr, "Failed to set api version %" PRIu64 ": %s\n",
1307 version, strerror(errno));
1308 goto err_dev;
1309 }
1310
1311 dev_config = calloc(size + config_size, 1);
1312 if (!dev_config) {
1313 fprintf(stderr, "Failed to allocate config space\n");
1314 goto err_dev;
1315 }
1316
1317 assert(!vduse_name_is_invalid(name));
1318 strcpy(dev_config->name, name);
1319 dev_config->device_id = device_id;
1320 dev_config->vendor_id = vendor_id;
1321 dev_config->features = features;
1322 dev_config->vq_num = num_queues;
1323 dev_config->vq_align = VDUSE_VQ_ALIGN;
1324 dev_config->config_size = config_size;
1325 memcpy(dev_config->config, config, config_size);
1326
1327 ret = ioctl(ctrl_fd, VDUSE_CREATE_DEV, dev_config);
1328 free(dev_config);
1329 if (ret && errno != EEXIST) {
1330 fprintf(stderr, "Failed to create vduse device %s: %s\n",
1331 name, strerror(errno));
1332 goto err_dev;
1333 }
1334 dev->ctrl_fd = ctrl_fd;
1335
1336 ret = vduse_dev_init(dev, name, num_queues, ops, priv);
1337 if (ret < 0) {
1338 fprintf(stderr, "Failed to init vduse device %s: %s\n",
1339 name, strerror(-ret));
1340 goto err;
1341 }
1342
1343 return dev;
1344 err:
1345 ioctl(ctrl_fd, VDUSE_DESTROY_DEV, name);
1346 err_dev:
1347 close(ctrl_fd);
1348 err_ctrl:
1349 free(dev);
1350
1351 return NULL;
1352 }
1353
1354 int vduse_dev_destroy(VduseDev *dev)
1355 {
1356 size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE);
1357 int i, ret = 0;
1358
1359 if (dev->log) {
1360 munmap(dev->log, log_size);
1361 }
1362 for (i = 0; i < dev->num_queues; i++) {
1363 free(dev->vqs[i].resubmit_list);
1364 }
1365 free(dev->vqs);
1366 if (dev->fd >= 0) {
1367 close(dev->fd);
1368 dev->fd = -1;
1369 }
1370 if (dev->ctrl_fd >= 0) {
1371 if (ioctl(dev->ctrl_fd, VDUSE_DESTROY_DEV, dev->name)) {
1372 ret = -errno;
1373 }
1374 close(dev->ctrl_fd);
1375 dev->ctrl_fd = -1;
1376 }
1377 free(dev->name);
1378 free(dev);
1379
1380 return ret;
1381 }