]> git.proxmox.com Git - mirror_qemu.git/blob - tests/vhost-user-bridge.c
784f15f37a322570d246c40f0d832643adaf7988
[mirror_qemu.git] / tests / vhost-user-bridge.c
1 /*
2 * Vhost User Bridge
3 *
4 * Copyright (c) 2015 Red Hat, Inc.
5 *
6 * Authors:
7 * Victor Kaplansky <victork@redhat.com>
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2 or
10 * later. See the COPYING file in the top-level directory.
11 */
12
13 /*
14 * TODO:
15 * - main should get parameters from the command line.
16 * - implement all request handlers. Still not implemented:
17 * vubr_get_queue_num_exec()
18 * vubr_send_rarp_exec()
19 * - test for broken requests and virtqueue.
20 * - implement features defined by Virtio 1.0 spec.
21 * - support mergeable buffers and indirect descriptors.
22 * - implement clean shutdown.
23 * - implement non-blocking writes to UDP backend.
24 * - implement polling strategy.
25 * - implement clean starting/stopping of vq processing
26 * - implement clean starting/stopping of used and buffers
27 * dirty page logging.
28 */
29
30 #define _FILE_OFFSET_BITS 64
31
32 #include <stddef.h>
33 #include <assert.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <stdint.h>
37 #include <inttypes.h>
38 #include <string.h>
39 #include <unistd.h>
40 #include <errno.h>
41 #include <sys/types.h>
42 #include <sys/socket.h>
43 #include <sys/un.h>
44 #include <sys/unistd.h>
45 #include <sys/mman.h>
46 #include <sys/eventfd.h>
47 #include <arpa/inet.h>
48
49 #include <linux/vhost.h>
50
51 #include "qemu/atomic.h"
52 #include "standard-headers/linux/virtio_net.h"
53 #include "standard-headers/linux/virtio_ring.h"
54
55 #define VHOST_USER_BRIDGE_DEBUG 1
56
57 #define DPRINT(...) \
58 do { \
59 if (VHOST_USER_BRIDGE_DEBUG) { \
60 printf(__VA_ARGS__); \
61 } \
62 } while (0)
63
64 typedef void (*CallbackFunc)(int sock, void *ctx);
65
66 typedef struct Event {
67 void *ctx;
68 CallbackFunc callback;
69 } Event;
70
71 typedef struct Dispatcher {
72 int max_sock;
73 fd_set fdset;
74 Event events[FD_SETSIZE];
75 } Dispatcher;
76
77 static void
78 vubr_die(const char *s)
79 {
80 perror(s);
81 exit(1);
82 }
83
84 static int
85 dispatcher_init(Dispatcher *dispr)
86 {
87 FD_ZERO(&dispr->fdset);
88 dispr->max_sock = -1;
89 return 0;
90 }
91
92 static int
93 dispatcher_add(Dispatcher *dispr, int sock, void *ctx, CallbackFunc cb)
94 {
95 if (sock >= FD_SETSIZE) {
96 fprintf(stderr,
97 "Error: Failed to add new event. sock %d should be less than %d\n",
98 sock, FD_SETSIZE);
99 return -1;
100 }
101
102 dispr->events[sock].ctx = ctx;
103 dispr->events[sock].callback = cb;
104
105 FD_SET(sock, &dispr->fdset);
106 if (sock > dispr->max_sock) {
107 dispr->max_sock = sock;
108 }
109 DPRINT("Added sock %d for watching. max_sock: %d\n",
110 sock, dispr->max_sock);
111 return 0;
112 }
113
114 #if 0
115 /* dispatcher_remove() is not currently in use but may be useful
116 * in the future. */
117 static int
118 dispatcher_remove(Dispatcher *dispr, int sock)
119 {
120 if (sock >= FD_SETSIZE) {
121 fprintf(stderr,
122 "Error: Failed to remove event. sock %d should be less than %d\n",
123 sock, FD_SETSIZE);
124 return -1;
125 }
126
127 FD_CLR(sock, &dispr->fdset);
128 return 0;
129 }
130 #endif
131
132 /* timeout in us */
133 static int
134 dispatcher_wait(Dispatcher *dispr, uint32_t timeout)
135 {
136 struct timeval tv;
137 tv.tv_sec = timeout / 1000000;
138 tv.tv_usec = timeout % 1000000;
139
140 fd_set fdset = dispr->fdset;
141
142 /* wait until some of sockets become readable. */
143 int rc = select(dispr->max_sock + 1, &fdset, 0, 0, &tv);
144
145 if (rc == -1) {
146 vubr_die("select");
147 }
148
149 /* Timeout */
150 if (rc == 0) {
151 return 0;
152 }
153
154 /* Now call callback for every ready socket. */
155
156 int sock;
157 for (sock = 0; sock < dispr->max_sock + 1; sock++)
158 if (FD_ISSET(sock, &fdset)) {
159 Event *e = &dispr->events[sock];
160 e->callback(sock, e->ctx);
161 }
162
163 return 0;
164 }
165
166 typedef struct VubrVirtq {
167 int call_fd;
168 int kick_fd;
169 uint32_t size;
170 uint16_t last_avail_index;
171 uint16_t last_used_index;
172 struct vring_desc *desc;
173 struct vring_avail *avail;
174 struct vring_used *used;
175 uint64_t log_guest_addr;
176 int enable;
177 } VubrVirtq;
178
179 /* Based on qemu/hw/virtio/vhost-user.c */
180
181 #define VHOST_MEMORY_MAX_NREGIONS 8
182 #define VHOST_USER_F_PROTOCOL_FEATURES 30
183
184 #define VHOST_LOG_PAGE 4096
185
186 enum VhostUserProtocolFeature {
187 VHOST_USER_PROTOCOL_F_MQ = 0,
188 VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1,
189 VHOST_USER_PROTOCOL_F_RARP = 2,
190
191 VHOST_USER_PROTOCOL_F_MAX
192 };
193
194 #define VHOST_USER_PROTOCOL_FEATURE_MASK ((1 << VHOST_USER_PROTOCOL_F_MAX) - 1)
195
196 typedef enum VhostUserRequest {
197 VHOST_USER_NONE = 0,
198 VHOST_USER_GET_FEATURES = 1,
199 VHOST_USER_SET_FEATURES = 2,
200 VHOST_USER_SET_OWNER = 3,
201 VHOST_USER_RESET_OWNER = 4,
202 VHOST_USER_SET_MEM_TABLE = 5,
203 VHOST_USER_SET_LOG_BASE = 6,
204 VHOST_USER_SET_LOG_FD = 7,
205 VHOST_USER_SET_VRING_NUM = 8,
206 VHOST_USER_SET_VRING_ADDR = 9,
207 VHOST_USER_SET_VRING_BASE = 10,
208 VHOST_USER_GET_VRING_BASE = 11,
209 VHOST_USER_SET_VRING_KICK = 12,
210 VHOST_USER_SET_VRING_CALL = 13,
211 VHOST_USER_SET_VRING_ERR = 14,
212 VHOST_USER_GET_PROTOCOL_FEATURES = 15,
213 VHOST_USER_SET_PROTOCOL_FEATURES = 16,
214 VHOST_USER_GET_QUEUE_NUM = 17,
215 VHOST_USER_SET_VRING_ENABLE = 18,
216 VHOST_USER_SEND_RARP = 19,
217 VHOST_USER_MAX
218 } VhostUserRequest;
219
220 typedef struct VhostUserMemoryRegion {
221 uint64_t guest_phys_addr;
222 uint64_t memory_size;
223 uint64_t userspace_addr;
224 uint64_t mmap_offset;
225 } VhostUserMemoryRegion;
226
227 typedef struct VhostUserMemory {
228 uint32_t nregions;
229 uint32_t padding;
230 VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
231 } VhostUserMemory;
232
233 typedef struct VhostUserLog {
234 uint64_t mmap_size;
235 uint64_t mmap_offset;
236 } VhostUserLog;
237
238 typedef struct VhostUserMsg {
239 VhostUserRequest request;
240
241 #define VHOST_USER_VERSION_MASK (0x3)
242 #define VHOST_USER_REPLY_MASK (0x1<<2)
243 uint32_t flags;
244 uint32_t size; /* the following payload size */
245 union {
246 #define VHOST_USER_VRING_IDX_MASK (0xff)
247 #define VHOST_USER_VRING_NOFD_MASK (0x1<<8)
248 uint64_t u64;
249 struct vhost_vring_state state;
250 struct vhost_vring_addr addr;
251 VhostUserMemory memory;
252 VhostUserLog log;
253 } payload;
254 int fds[VHOST_MEMORY_MAX_NREGIONS];
255 int fd_num;
256 } QEMU_PACKED VhostUserMsg;
257
258 #define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
259
260 /* The version of the protocol we support */
261 #define VHOST_USER_VERSION (0x1)
262
263 #define MAX_NR_VIRTQUEUE (8)
264
265 typedef struct VubrDevRegion {
266 /* Guest Physical address. */
267 uint64_t gpa;
268 /* Memory region size. */
269 uint64_t size;
270 /* QEMU virtual address (userspace). */
271 uint64_t qva;
272 /* Starting offset in our mmaped space. */
273 uint64_t mmap_offset;
274 /* Start address of mmaped space. */
275 uint64_t mmap_addr;
276 } VubrDevRegion;
277
278 typedef struct VubrDev {
279 int sock;
280 Dispatcher dispatcher;
281 uint32_t nregions;
282 VubrDevRegion regions[VHOST_MEMORY_MAX_NREGIONS];
283 VubrVirtq vq[MAX_NR_VIRTQUEUE];
284 int log_call_fd;
285 uint64_t log_size;
286 uint8_t *log_table;
287 int backend_udp_sock;
288 struct sockaddr_in backend_udp_dest;
289 int ready;
290 uint64_t features;
291 } VubrDev;
292
293 static const char *vubr_request_str[] = {
294 [VHOST_USER_NONE] = "VHOST_USER_NONE",
295 [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
296 [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
297 [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
298 [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
299 [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
300 [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
301 [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
302 [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
303 [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
304 [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
305 [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
306 [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
307 [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
308 [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR",
309 [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES",
310 [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES",
311 [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM",
312 [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE",
313 [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP",
314 [VHOST_USER_MAX] = "VHOST_USER_MAX",
315 };
316
317 static void
318 print_buffer(uint8_t *buf, size_t len)
319 {
320 int i;
321 printf("Raw buffer:\n");
322 for (i = 0; i < len; i++) {
323 if (i % 16 == 0) {
324 printf("\n");
325 }
326 if (i % 4 == 0) {
327 printf(" ");
328 }
329 printf("%02x ", buf[i]);
330 }
331 printf("\n............................................................\n");
332 }
333
334 /* Translate guest physical address to our virtual address. */
335 static uint64_t
336 gpa_to_va(VubrDev *dev, uint64_t guest_addr)
337 {
338 int i;
339
340 /* Find matching memory region. */
341 for (i = 0; i < dev->nregions; i++) {
342 VubrDevRegion *r = &dev->regions[i];
343
344 if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) {
345 return guest_addr - r->gpa + r->mmap_addr + r->mmap_offset;
346 }
347 }
348
349 assert(!"address not found in regions");
350 return 0;
351 }
352
353 /* Translate qemu virtual address to our virtual address. */
354 static uint64_t
355 qva_to_va(VubrDev *dev, uint64_t qemu_addr)
356 {
357 int i;
358
359 /* Find matching memory region. */
360 for (i = 0; i < dev->nregions; i++) {
361 VubrDevRegion *r = &dev->regions[i];
362
363 if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) {
364 return qemu_addr - r->qva + r->mmap_addr + r->mmap_offset;
365 }
366 }
367
368 assert(!"address not found in regions");
369 return 0;
370 }
371
372 static void
373 vubr_message_read(int conn_fd, VhostUserMsg *vmsg)
374 {
375 char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
376 struct iovec iov = {
377 .iov_base = (char *)vmsg,
378 .iov_len = VHOST_USER_HDR_SIZE,
379 };
380 struct msghdr msg = {
381 .msg_iov = &iov,
382 .msg_iovlen = 1,
383 .msg_control = control,
384 .msg_controllen = sizeof(control),
385 };
386 size_t fd_size;
387 struct cmsghdr *cmsg;
388 int rc;
389
390 rc = recvmsg(conn_fd, &msg, 0);
391
392 if (rc == 0) {
393 vubr_die("recvmsg");
394 fprintf(stderr, "Peer disconnected.\n");
395 exit(1);
396 }
397 if (rc < 0) {
398 vubr_die("recvmsg");
399 }
400
401 vmsg->fd_num = 0;
402 for (cmsg = CMSG_FIRSTHDR(&msg);
403 cmsg != NULL;
404 cmsg = CMSG_NXTHDR(&msg, cmsg))
405 {
406 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
407 fd_size = cmsg->cmsg_len - CMSG_LEN(0);
408 vmsg->fd_num = fd_size / sizeof(int);
409 memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
410 break;
411 }
412 }
413
414 if (vmsg->size > sizeof(vmsg->payload)) {
415 fprintf(stderr,
416 "Error: too big message request: %d, size: vmsg->size: %u, "
417 "while sizeof(vmsg->payload) = %lu\n",
418 vmsg->request, vmsg->size, sizeof(vmsg->payload));
419 exit(1);
420 }
421
422 if (vmsg->size) {
423 rc = read(conn_fd, &vmsg->payload, vmsg->size);
424 if (rc == 0) {
425 vubr_die("recvmsg");
426 fprintf(stderr, "Peer disconnected.\n");
427 exit(1);
428 }
429 if (rc < 0) {
430 vubr_die("recvmsg");
431 }
432
433 assert(rc == vmsg->size);
434 }
435 }
436
437 static void
438 vubr_message_write(int conn_fd, VhostUserMsg *vmsg)
439 {
440 int rc;
441
442 do {
443 rc = write(conn_fd, vmsg, VHOST_USER_HDR_SIZE + vmsg->size);
444 } while (rc < 0 && errno == EINTR);
445
446 if (rc < 0) {
447 vubr_die("write");
448 }
449 }
450
451 static void
452 vubr_backend_udp_sendbuf(VubrDev *dev, uint8_t *buf, size_t len)
453 {
454 int slen = sizeof(struct sockaddr_in);
455
456 if (sendto(dev->backend_udp_sock, buf, len, 0,
457 (struct sockaddr *) &dev->backend_udp_dest, slen) == -1) {
458 vubr_die("sendto()");
459 }
460 }
461
462 static int
463 vubr_backend_udp_recvbuf(VubrDev *dev, uint8_t *buf, size_t buflen)
464 {
465 int slen = sizeof(struct sockaddr_in);
466 int rc;
467
468 rc = recvfrom(dev->backend_udp_sock, buf, buflen, 0,
469 (struct sockaddr *) &dev->backend_udp_dest,
470 (socklen_t *)&slen);
471 if (rc == -1) {
472 vubr_die("recvfrom()");
473 }
474
475 return rc;
476 }
477
478 static void
479 vubr_consume_raw_packet(VubrDev *dev, uint8_t *buf, uint32_t len)
480 {
481 int hdrlen = sizeof(struct virtio_net_hdr_v1);
482
483 if (VHOST_USER_BRIDGE_DEBUG) {
484 print_buffer(buf, len);
485 }
486 vubr_backend_udp_sendbuf(dev, buf + hdrlen, len - hdrlen);
487 }
488
489 /* Kick the log_call_fd if required. */
490 static void
491 vubr_log_kick(VubrDev *dev)
492 {
493 if (dev->log_call_fd != -1) {
494 DPRINT("Kicking the QEMU's log...\n");
495 eventfd_write(dev->log_call_fd, 1);
496 }
497 }
498
499 /* Kick the guest if necessary. */
500 static void
501 vubr_virtqueue_kick(VubrVirtq *vq)
502 {
503 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
504 DPRINT("Kicking the guest...\n");
505 eventfd_write(vq->call_fd, 1);
506 }
507 }
508
509 static void
510 vubr_log_page(uint8_t *log_table, uint64_t page)
511 {
512 DPRINT("Logged dirty guest page: %"PRId64"\n", page);
513 atomic_or(&log_table[page / 8], 1 << (page % 8));
514 }
515
516 static void
517 vubr_log_write(VubrDev *dev, uint64_t address, uint64_t length)
518 {
519 uint64_t page;
520
521 if (!(dev->features & (1ULL << VHOST_F_LOG_ALL)) ||
522 !dev->log_table || !length) {
523 return;
524 }
525
526 assert(dev->log_size > ((address + length - 1) / VHOST_LOG_PAGE / 8));
527
528 page = address / VHOST_LOG_PAGE;
529 while (page * VHOST_LOG_PAGE < address + length) {
530 vubr_log_page(dev->log_table, page);
531 page += VHOST_LOG_PAGE;
532 }
533 vubr_log_kick(dev);
534 }
535
536 static void
537 vubr_post_buffer(VubrDev *dev, VubrVirtq *vq, uint8_t *buf, int32_t len)
538 {
539 struct vring_desc *desc = vq->desc;
540 struct vring_avail *avail = vq->avail;
541 struct vring_used *used = vq->used;
542 uint64_t log_guest_addr = vq->log_guest_addr;
543
544 unsigned int size = vq->size;
545
546 uint16_t avail_index = atomic_mb_read(&avail->idx);
547
548 /* We check the available descriptors before posting the
549 * buffer, so here we assume that enough available
550 * descriptors. */
551 assert(vq->last_avail_index != avail_index);
552 uint16_t a_index = vq->last_avail_index % size;
553 uint16_t u_index = vq->last_used_index % size;
554 uint16_t d_index = avail->ring[a_index];
555
556 int i = d_index;
557
558 DPRINT("Post packet to guest on vq:\n");
559 DPRINT(" size = %d\n", vq->size);
560 DPRINT(" last_avail_index = %d\n", vq->last_avail_index);
561 DPRINT(" last_used_index = %d\n", vq->last_used_index);
562 DPRINT(" a_index = %d\n", a_index);
563 DPRINT(" u_index = %d\n", u_index);
564 DPRINT(" d_index = %d\n", d_index);
565 DPRINT(" desc[%d].addr = 0x%016"PRIx64"\n", i, desc[i].addr);
566 DPRINT(" desc[%d].len = %d\n", i, desc[i].len);
567 DPRINT(" desc[%d].flags = %d\n", i, desc[i].flags);
568 DPRINT(" avail->idx = %d\n", avail_index);
569 DPRINT(" used->idx = %d\n", used->idx);
570
571 if (!(desc[i].flags & VRING_DESC_F_WRITE)) {
572 /* FIXME: we should find writable descriptor. */
573 fprintf(stderr, "Error: descriptor is not writable. Exiting.\n");
574 exit(1);
575 }
576
577 void *chunk_start = (void *)gpa_to_va(dev, desc[i].addr);
578 uint32_t chunk_len = desc[i].len;
579
580 if (len <= chunk_len) {
581 memcpy(chunk_start, buf, len);
582 vubr_log_write(dev, desc[i].addr, len);
583 } else {
584 fprintf(stderr,
585 "Received too long packet from the backend. Dropping...\n");
586 return;
587 }
588
589 /* Add descriptor to the used ring. */
590 used->ring[u_index].id = d_index;
591 used->ring[u_index].len = len;
592 vubr_log_write(dev,
593 log_guest_addr + offsetof(struct vring_used, ring[u_index]),
594 sizeof(used->ring[u_index]));
595
596 vq->last_avail_index++;
597 vq->last_used_index++;
598
599 atomic_mb_set(&used->idx, vq->last_used_index);
600 vubr_log_write(dev,
601 log_guest_addr + offsetof(struct vring_used, idx),
602 sizeof(used->idx));
603
604 /* Kick the guest if necessary. */
605 vubr_virtqueue_kick(vq);
606 }
607
608 static int
609 vubr_process_desc(VubrDev *dev, VubrVirtq *vq)
610 {
611 struct vring_desc *desc = vq->desc;
612 struct vring_avail *avail = vq->avail;
613 struct vring_used *used = vq->used;
614 uint64_t log_guest_addr = vq->log_guest_addr;
615
616 unsigned int size = vq->size;
617
618 uint16_t a_index = vq->last_avail_index % size;
619 uint16_t u_index = vq->last_used_index % size;
620 uint16_t d_index = avail->ring[a_index];
621
622 uint32_t i, len = 0;
623 size_t buf_size = 4096;
624 uint8_t buf[4096];
625
626 DPRINT("Chunks: ");
627 i = d_index;
628 do {
629 void *chunk_start = (void *)gpa_to_va(dev, desc[i].addr);
630 uint32_t chunk_len = desc[i].len;
631
632 assert(!(desc[i].flags & VRING_DESC_F_WRITE));
633
634 if (len + chunk_len < buf_size) {
635 memcpy(buf + len, chunk_start, chunk_len);
636 DPRINT("%d ", chunk_len);
637 } else {
638 fprintf(stderr, "Error: too long packet. Dropping...\n");
639 break;
640 }
641
642 len += chunk_len;
643
644 if (!(desc[i].flags & VRING_DESC_F_NEXT)) {
645 break;
646 }
647
648 i = desc[i].next;
649 } while (1);
650 DPRINT("\n");
651
652 if (!len) {
653 return -1;
654 }
655
656 /* Add descriptor to the used ring. */
657 used->ring[u_index].id = d_index;
658 used->ring[u_index].len = len;
659 vubr_log_write(dev,
660 log_guest_addr + offsetof(struct vring_used, ring[u_index]),
661 sizeof(used->ring[u_index]));
662
663 vubr_consume_raw_packet(dev, buf, len);
664
665 return 0;
666 }
667
668 static void
669 vubr_process_avail(VubrDev *dev, VubrVirtq *vq)
670 {
671 struct vring_avail *avail = vq->avail;
672 struct vring_used *used = vq->used;
673 uint64_t log_guest_addr = vq->log_guest_addr;
674
675 while (vq->last_avail_index != atomic_mb_read(&avail->idx)) {
676 vubr_process_desc(dev, vq);
677 vq->last_avail_index++;
678 vq->last_used_index++;
679 }
680
681 atomic_mb_set(&used->idx, vq->last_used_index);
682 vubr_log_write(dev,
683 log_guest_addr + offsetof(struct vring_used, idx),
684 sizeof(used->idx));
685 }
686
687 static void
688 vubr_backend_recv_cb(int sock, void *ctx)
689 {
690 VubrDev *dev = (VubrDev *) ctx;
691 VubrVirtq *rx_vq = &dev->vq[0];
692 uint8_t buf[4096];
693 struct virtio_net_hdr_v1 *hdr = (struct virtio_net_hdr_v1 *)buf;
694 int hdrlen = sizeof(struct virtio_net_hdr_v1);
695 int buflen = sizeof(buf);
696 int len;
697
698 if (!dev->ready) {
699 return;
700 }
701
702 DPRINT("\n\n *** IN UDP RECEIVE CALLBACK ***\n\n");
703
704 uint16_t avail_index = atomic_mb_read(&rx_vq->avail->idx);
705
706 /* If there is no available descriptors, just do nothing.
707 * The buffer will be handled by next arrived UDP packet,
708 * or next kick on receive virtq. */
709 if (rx_vq->last_avail_index == avail_index) {
710 DPRINT("Got UDP packet, but no available descriptors on RX virtq.\n");
711 return;
712 }
713
714 len = vubr_backend_udp_recvbuf(dev, buf + hdrlen, buflen - hdrlen);
715
716 *hdr = (struct virtio_net_hdr_v1) { };
717 hdr->num_buffers = 1;
718 vubr_post_buffer(dev, rx_vq, buf, len + hdrlen);
719 }
720
721 static void
722 vubr_kick_cb(int sock, void *ctx)
723 {
724 VubrDev *dev = (VubrDev *) ctx;
725 eventfd_t kick_data;
726 ssize_t rc;
727
728 rc = eventfd_read(sock, &kick_data);
729 if (rc == -1) {
730 vubr_die("eventfd_read()");
731 } else {
732 DPRINT("Got kick_data: %016"PRIx64"\n", kick_data);
733 vubr_process_avail(dev, &dev->vq[1]);
734 }
735 }
736
737 static int
738 vubr_none_exec(VubrDev *dev, VhostUserMsg *vmsg)
739 {
740 DPRINT("Function %s() not implemented yet.\n", __func__);
741 return 0;
742 }
743
744 static int
745 vubr_get_features_exec(VubrDev *dev, VhostUserMsg *vmsg)
746 {
747 vmsg->payload.u64 =
748 ((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
749 (1ULL << VHOST_F_LOG_ALL) |
750 (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
751 (1ULL << VHOST_USER_F_PROTOCOL_FEATURES));
752
753 vmsg->size = sizeof(vmsg->payload.u64);
754
755 DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
756
757 /* Reply */
758 return 1;
759 }
760
761 static int
762 vubr_set_features_exec(VubrDev *dev, VhostUserMsg *vmsg)
763 {
764 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
765 dev->features = vmsg->payload.u64;
766 return 0;
767 }
768
769 static int
770 vubr_set_owner_exec(VubrDev *dev, VhostUserMsg *vmsg)
771 {
772 return 0;
773 }
774
775 static void
776 vubr_close_log(VubrDev *dev)
777 {
778 if (dev->log_table) {
779 if (munmap(dev->log_table, dev->log_size) != 0) {
780 vubr_die("munmap()");
781 }
782
783 dev->log_table = 0;
784 }
785 if (dev->log_call_fd != -1) {
786 close(dev->log_call_fd);
787 dev->log_call_fd = -1;
788 }
789 }
790
791 static int
792 vubr_reset_device_exec(VubrDev *dev, VhostUserMsg *vmsg)
793 {
794 vubr_close_log(dev);
795 dev->ready = 0;
796 dev->features = 0;
797 return 0;
798 }
799
800 static int
801 vubr_set_mem_table_exec(VubrDev *dev, VhostUserMsg *vmsg)
802 {
803 int i;
804 VhostUserMemory *memory = &vmsg->payload.memory;
805 dev->nregions = memory->nregions;
806
807 DPRINT("Nregions: %d\n", memory->nregions);
808 for (i = 0; i < dev->nregions; i++) {
809 void *mmap_addr;
810 VhostUserMemoryRegion *msg_region = &memory->regions[i];
811 VubrDevRegion *dev_region = &dev->regions[i];
812
813 DPRINT("Region %d\n", i);
814 DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n",
815 msg_region->guest_phys_addr);
816 DPRINT(" memory_size: 0x%016"PRIx64"\n",
817 msg_region->memory_size);
818 DPRINT(" userspace_addr 0x%016"PRIx64"\n",
819 msg_region->userspace_addr);
820 DPRINT(" mmap_offset 0x%016"PRIx64"\n",
821 msg_region->mmap_offset);
822
823 dev_region->gpa = msg_region->guest_phys_addr;
824 dev_region->size = msg_region->memory_size;
825 dev_region->qva = msg_region->userspace_addr;
826 dev_region->mmap_offset = msg_region->mmap_offset;
827
828 /* We don't use offset argument of mmap() since the
829 * mapped address has to be page aligned, and we use huge
830 * pages. */
831 mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
832 PROT_READ | PROT_WRITE, MAP_SHARED,
833 vmsg->fds[i], 0);
834
835 if (mmap_addr == MAP_FAILED) {
836 vubr_die("mmap");
837 }
838
839 dev_region->mmap_addr = (uint64_t) mmap_addr;
840 DPRINT(" mmap_addr: 0x%016"PRIx64"\n", dev_region->mmap_addr);
841 }
842
843 return 0;
844 }
845
846 static int
847 vubr_set_log_base_exec(VubrDev *dev, VhostUserMsg *vmsg)
848 {
849 int fd;
850 uint64_t log_mmap_size, log_mmap_offset;
851 void *rc;
852
853 assert(vmsg->fd_num == 1);
854 fd = vmsg->fds[0];
855
856 assert(vmsg->size == sizeof(vmsg->payload.log));
857 log_mmap_offset = vmsg->payload.log.mmap_offset;
858 log_mmap_size = vmsg->payload.log.mmap_size;
859 DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset);
860 DPRINT("Log mmap_size: %"PRId64"\n", log_mmap_size);
861
862 rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
863 log_mmap_offset);
864 if (rc == MAP_FAILED) {
865 vubr_die("mmap");
866 }
867 dev->log_table = rc;
868 dev->log_size = log_mmap_size;
869
870 vmsg->size = sizeof(vmsg->payload.u64);
871 /* Reply */
872 return 1;
873 }
874
875 static int
876 vubr_set_log_fd_exec(VubrDev *dev, VhostUserMsg *vmsg)
877 {
878 assert(vmsg->fd_num == 1);
879 dev->log_call_fd = vmsg->fds[0];
880 DPRINT("Got log_call_fd: %d\n", vmsg->fds[0]);
881 return 0;
882 }
883
884 static int
885 vubr_set_vring_num_exec(VubrDev *dev, VhostUserMsg *vmsg)
886 {
887 unsigned int index = vmsg->payload.state.index;
888 unsigned int num = vmsg->payload.state.num;
889
890 DPRINT("State.index: %d\n", index);
891 DPRINT("State.num: %d\n", num);
892 dev->vq[index].size = num;
893 return 0;
894 }
895
896 static int
897 vubr_set_vring_addr_exec(VubrDev *dev, VhostUserMsg *vmsg)
898 {
899 struct vhost_vring_addr *vra = &vmsg->payload.addr;
900 unsigned int index = vra->index;
901 VubrVirtq *vq = &dev->vq[index];
902
903 DPRINT("vhost_vring_addr:\n");
904 DPRINT(" index: %d\n", vra->index);
905 DPRINT(" flags: %d\n", vra->flags);
906 DPRINT(" desc_user_addr: 0x%016llx\n", vra->desc_user_addr);
907 DPRINT(" used_user_addr: 0x%016llx\n", vra->used_user_addr);
908 DPRINT(" avail_user_addr: 0x%016llx\n", vra->avail_user_addr);
909 DPRINT(" log_guest_addr: 0x%016llx\n", vra->log_guest_addr);
910
911 vq->desc = (struct vring_desc *)qva_to_va(dev, vra->desc_user_addr);
912 vq->used = (struct vring_used *)qva_to_va(dev, vra->used_user_addr);
913 vq->avail = (struct vring_avail *)qva_to_va(dev, vra->avail_user_addr);
914 vq->log_guest_addr = vra->log_guest_addr;
915
916 DPRINT("Setting virtq addresses:\n");
917 DPRINT(" vring_desc at %p\n", vq->desc);
918 DPRINT(" vring_used at %p\n", vq->used);
919 DPRINT(" vring_avail at %p\n", vq->avail);
920
921 vq->last_used_index = vq->used->idx;
922 return 0;
923 }
924
925 static int
926 vubr_set_vring_base_exec(VubrDev *dev, VhostUserMsg *vmsg)
927 {
928 unsigned int index = vmsg->payload.state.index;
929 unsigned int num = vmsg->payload.state.num;
930
931 DPRINT("State.index: %d\n", index);
932 DPRINT("State.num: %d\n", num);
933 dev->vq[index].last_avail_index = num;
934
935 return 0;
936 }
937
938 static int
939 vubr_get_vring_base_exec(VubrDev *dev, VhostUserMsg *vmsg)
940 {
941 unsigned int index = vmsg->payload.state.index;
942
943 DPRINT("State.index: %d\n", index);
944 vmsg->payload.state.num = dev->vq[index].last_avail_index;
945 vmsg->size = sizeof(vmsg->payload.state);
946 /* FIXME: this is a work-around for a bug in QEMU enabling
947 * too early vrings. When protocol features are enabled,
948 * we have to respect * VHOST_USER_SET_VRING_ENABLE request. */
949 dev->ready = 0;
950
951 /* Reply */
952 return 1;
953 }
954
955 static int
956 vubr_set_vring_kick_exec(VubrDev *dev, VhostUserMsg *vmsg)
957 {
958 uint64_t u64_arg = vmsg->payload.u64;
959 int index = u64_arg & VHOST_USER_VRING_IDX_MASK;
960
961 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
962
963 assert((u64_arg & VHOST_USER_VRING_NOFD_MASK) == 0);
964 assert(vmsg->fd_num == 1);
965
966 dev->vq[index].kick_fd = vmsg->fds[0];
967 DPRINT("Got kick_fd: %d for vq: %d\n", vmsg->fds[0], index);
968
969 if (index % 2 == 1) {
970 /* TX queue. */
971 dispatcher_add(&dev->dispatcher, dev->vq[index].kick_fd,
972 dev, vubr_kick_cb);
973
974 DPRINT("Waiting for kicks on fd: %d for vq: %d\n",
975 dev->vq[index].kick_fd, index);
976 }
977 /* We temporarily use this hack to determine that both TX and RX
978 * queues are set up and ready for processing.
979 * FIXME: we need to rely in VHOST_USER_SET_VRING_ENABLE and
980 * actual kicks. */
981 if (dev->vq[0].kick_fd != -1 &&
982 dev->vq[1].kick_fd != -1) {
983 dev->ready = 1;
984 DPRINT("vhost-user-bridge is ready for processing queues.\n");
985 }
986 return 0;
987
988 }
989
990 static int
991 vubr_set_vring_call_exec(VubrDev *dev, VhostUserMsg *vmsg)
992 {
993 uint64_t u64_arg = vmsg->payload.u64;
994 int index = u64_arg & VHOST_USER_VRING_IDX_MASK;
995
996 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
997 assert((u64_arg & VHOST_USER_VRING_NOFD_MASK) == 0);
998 assert(vmsg->fd_num == 1);
999
1000 dev->vq[index].call_fd = vmsg->fds[0];
1001 DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index);
1002
1003 return 0;
1004 }
1005
1006 static int
1007 vubr_set_vring_err_exec(VubrDev *dev, VhostUserMsg *vmsg)
1008 {
1009 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
1010 return 0;
1011 }
1012
1013 static int
1014 vubr_get_protocol_features_exec(VubrDev *dev, VhostUserMsg *vmsg)
1015 {
1016 vmsg->payload.u64 = 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD;
1017 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
1018 vmsg->size = sizeof(vmsg->payload.u64);
1019
1020 /* Reply */
1021 return 1;
1022 }
1023
1024 static int
1025 vubr_set_protocol_features_exec(VubrDev *dev, VhostUserMsg *vmsg)
1026 {
1027 /* FIXME: unimplented */
1028 DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
1029 return 0;
1030 }
1031
1032 static int
1033 vubr_get_queue_num_exec(VubrDev *dev, VhostUserMsg *vmsg)
1034 {
1035 DPRINT("Function %s() not implemented yet.\n", __func__);
1036 return 0;
1037 }
1038
1039 static int
1040 vubr_set_vring_enable_exec(VubrDev *dev, VhostUserMsg *vmsg)
1041 {
1042 unsigned int index = vmsg->payload.state.index;
1043 unsigned int enable = vmsg->payload.state.num;
1044
1045 DPRINT("State.index: %d\n", index);
1046 DPRINT("State.enable: %d\n", enable);
1047 dev->vq[index].enable = enable;
1048 return 0;
1049 }
1050
1051 static int
1052 vubr_send_rarp_exec(VubrDev *dev, VhostUserMsg *vmsg)
1053 {
1054 DPRINT("Function %s() not implemented yet.\n", __func__);
1055 return 0;
1056 }
1057
1058 static int
1059 vubr_execute_request(VubrDev *dev, VhostUserMsg *vmsg)
1060 {
1061 /* Print out generic part of the request. */
1062 DPRINT(
1063 "================== Vhost user message from QEMU ==================\n");
1064 DPRINT("Request: %s (%d)\n", vubr_request_str[vmsg->request],
1065 vmsg->request);
1066 DPRINT("Flags: 0x%x\n", vmsg->flags);
1067 DPRINT("Size: %d\n", vmsg->size);
1068
1069 if (vmsg->fd_num) {
1070 int i;
1071 DPRINT("Fds:");
1072 for (i = 0; i < vmsg->fd_num; i++) {
1073 DPRINT(" %d", vmsg->fds[i]);
1074 }
1075 DPRINT("\n");
1076 }
1077
1078 switch (vmsg->request) {
1079 case VHOST_USER_NONE:
1080 return vubr_none_exec(dev, vmsg);
1081 case VHOST_USER_GET_FEATURES:
1082 return vubr_get_features_exec(dev, vmsg);
1083 case VHOST_USER_SET_FEATURES:
1084 return vubr_set_features_exec(dev, vmsg);
1085 case VHOST_USER_SET_OWNER:
1086 return vubr_set_owner_exec(dev, vmsg);
1087 case VHOST_USER_RESET_OWNER:
1088 return vubr_reset_device_exec(dev, vmsg);
1089 case VHOST_USER_SET_MEM_TABLE:
1090 return vubr_set_mem_table_exec(dev, vmsg);
1091 case VHOST_USER_SET_LOG_BASE:
1092 return vubr_set_log_base_exec(dev, vmsg);
1093 case VHOST_USER_SET_LOG_FD:
1094 return vubr_set_log_fd_exec(dev, vmsg);
1095 case VHOST_USER_SET_VRING_NUM:
1096 return vubr_set_vring_num_exec(dev, vmsg);
1097 case VHOST_USER_SET_VRING_ADDR:
1098 return vubr_set_vring_addr_exec(dev, vmsg);
1099 case VHOST_USER_SET_VRING_BASE:
1100 return vubr_set_vring_base_exec(dev, vmsg);
1101 case VHOST_USER_GET_VRING_BASE:
1102 return vubr_get_vring_base_exec(dev, vmsg);
1103 case VHOST_USER_SET_VRING_KICK:
1104 return vubr_set_vring_kick_exec(dev, vmsg);
1105 case VHOST_USER_SET_VRING_CALL:
1106 return vubr_set_vring_call_exec(dev, vmsg);
1107 case VHOST_USER_SET_VRING_ERR:
1108 return vubr_set_vring_err_exec(dev, vmsg);
1109 case VHOST_USER_GET_PROTOCOL_FEATURES:
1110 return vubr_get_protocol_features_exec(dev, vmsg);
1111 case VHOST_USER_SET_PROTOCOL_FEATURES:
1112 return vubr_set_protocol_features_exec(dev, vmsg);
1113 case VHOST_USER_GET_QUEUE_NUM:
1114 return vubr_get_queue_num_exec(dev, vmsg);
1115 case VHOST_USER_SET_VRING_ENABLE:
1116 return vubr_set_vring_enable_exec(dev, vmsg);
1117 case VHOST_USER_SEND_RARP:
1118 return vubr_send_rarp_exec(dev, vmsg);
1119
1120 case VHOST_USER_MAX:
1121 assert(vmsg->request != VHOST_USER_MAX);
1122 }
1123 return 0;
1124 }
1125
1126 static void
1127 vubr_receive_cb(int sock, void *ctx)
1128 {
1129 VubrDev *dev = (VubrDev *) ctx;
1130 VhostUserMsg vmsg;
1131 int reply_requested;
1132
1133 vubr_message_read(sock, &vmsg);
1134 reply_requested = vubr_execute_request(dev, &vmsg);
1135 if (reply_requested) {
1136 /* Set the version in the flags when sending the reply */
1137 vmsg.flags &= ~VHOST_USER_VERSION_MASK;
1138 vmsg.flags |= VHOST_USER_VERSION;
1139 vmsg.flags |= VHOST_USER_REPLY_MASK;
1140 vubr_message_write(sock, &vmsg);
1141 }
1142 }
1143
1144 static void
1145 vubr_accept_cb(int sock, void *ctx)
1146 {
1147 VubrDev *dev = (VubrDev *)ctx;
1148 int conn_fd;
1149 struct sockaddr_un un;
1150 socklen_t len = sizeof(un);
1151
1152 conn_fd = accept(sock, (struct sockaddr *) &un, &len);
1153 if (conn_fd == -1) {
1154 vubr_die("accept()");
1155 }
1156 DPRINT("Got connection from remote peer on sock %d\n", conn_fd);
1157 dispatcher_add(&dev->dispatcher, conn_fd, ctx, vubr_receive_cb);
1158 }
1159
1160 static VubrDev *
1161 vubr_new(const char *path)
1162 {
1163 VubrDev *dev = (VubrDev *) calloc(1, sizeof(VubrDev));
1164 dev->nregions = 0;
1165 int i;
1166 struct sockaddr_un un;
1167 size_t len;
1168
1169 for (i = 0; i < MAX_NR_VIRTQUEUE; i++) {
1170 dev->vq[i] = (VubrVirtq) {
1171 .call_fd = -1, .kick_fd = -1,
1172 .size = 0,
1173 .last_avail_index = 0, .last_used_index = 0,
1174 .desc = 0, .avail = 0, .used = 0,
1175 .enable = 0,
1176 };
1177 }
1178
1179 /* Init log */
1180 dev->log_call_fd = -1;
1181 dev->log_size = 0;
1182 dev->log_table = 0;
1183 dev->ready = 0;
1184 dev->features = 0;
1185
1186 /* Get a UNIX socket. */
1187 dev->sock = socket(AF_UNIX, SOCK_STREAM, 0);
1188 if (dev->sock == -1) {
1189 vubr_die("socket");
1190 }
1191
1192 un.sun_family = AF_UNIX;
1193 strcpy(un.sun_path, path);
1194 len = sizeof(un.sun_family) + strlen(path);
1195 unlink(path);
1196
1197 if (bind(dev->sock, (struct sockaddr *) &un, len) == -1) {
1198 vubr_die("bind");
1199 }
1200
1201 if (listen(dev->sock, 1) == -1) {
1202 vubr_die("listen");
1203 }
1204
1205 dispatcher_init(&dev->dispatcher);
1206 dispatcher_add(&dev->dispatcher, dev->sock, (void *)dev,
1207 vubr_accept_cb);
1208
1209 DPRINT("Waiting for connections on UNIX socket %s ...\n", path);
1210 return dev;
1211 }
1212
1213 static void
1214 vubr_backend_udp_setup(VubrDev *dev,
1215 const char *local_host,
1216 uint16_t local_port,
1217 const char *dest_host,
1218 uint16_t dest_port)
1219 {
1220 int sock;
1221 struct sockaddr_in si_local = {
1222 .sin_family = AF_INET,
1223 .sin_port = htons(local_port),
1224 };
1225
1226 if (inet_aton(local_host, &si_local.sin_addr) == 0) {
1227 fprintf(stderr, "inet_aton() failed.\n");
1228 exit(1);
1229 }
1230
1231 /* setup destination for sends */
1232 dev->backend_udp_dest = (struct sockaddr_in) {
1233 .sin_family = AF_INET,
1234 .sin_port = htons(dest_port),
1235 };
1236 if (inet_aton(dest_host, &dev->backend_udp_dest.sin_addr) == 0) {
1237 fprintf(stderr, "inet_aton() failed.\n");
1238 exit(1);
1239 }
1240
1241 sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
1242 if (sock == -1) {
1243 vubr_die("socket");
1244 }
1245
1246 if (bind(sock, (struct sockaddr *)&si_local, sizeof(si_local)) == -1) {
1247 vubr_die("bind");
1248 }
1249
1250 dev->backend_udp_sock = sock;
1251 dispatcher_add(&dev->dispatcher, sock, dev, vubr_backend_recv_cb);
1252 DPRINT("Waiting for data from udp backend on %s:%d...\n",
1253 local_host, local_port);
1254 }
1255
1256 static void
1257 vubr_run(VubrDev *dev)
1258 {
1259 while (1) {
1260 /* timeout 200ms */
1261 dispatcher_wait(&dev->dispatcher, 200000);
1262 /* Here one can try polling strategy. */
1263 }
1264 }
1265
1266 int
1267 main(int argc, char *argv[])
1268 {
1269 VubrDev *dev;
1270
1271 dev = vubr_new("/tmp/vubr.sock");
1272 if (!dev) {
1273 return 1;
1274 }
1275
1276 vubr_backend_udp_setup(dev,
1277 "127.0.0.1", 4444,
1278 "127.0.0.1", 5555);
1279 vubr_run(dev);
1280 return 0;
1281 }