/*
 * Vhost User library
 *
 * Copyright IBM, Corp. 2007
 * Copyright (c) 2016 Red Hat, Inc.
 *
 * Authors:
 *  Anthony Liguori <aliguori@us.ibm.com>
 *  Marc-André Lureau <mlureau@redhat.com>
 *  Victor Kaplansky <victork@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later.  See the COPYING file in the top-level directory.
 */

/* this code avoids GLib dependency */
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <stdarg.h>
#include <errno.h>
#include <string.h>
#include <assert.h>
#include <inttypes.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/eventfd.h>
#include <sys/mman.h>
#include "qemu/compiler.h"

#if defined(__linux__)
#include <sys/syscall.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

#ifdef __NR_userfaultfd
#include <linux/userfaultfd.h>
#endif

#endif

#include "qemu/atomic.h"

#include "libvhost-user.h"

/* usually provided by GLib */
#ifndef MIN
#define MIN(x, y) ({                            \
            typeof(x) _min1 = (x);              \
            typeof(y) _min2 = (y);              \
            (void) (&_min1 == &_min2);          \
            _min1 < _min2 ? _min1 : _min2; })
#endif

#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)

/* The version of the protocol we support */
#define VHOST_USER_VERSION 1
#define LIBVHOST_USER_DEBUG 0

#define DPRINT(...)                             \
    do {                                        \
        if (LIBVHOST_USER_DEBUG) {              \
            fprintf(stderr, __VA_ARGS__);       \
        }                                       \
    } while (0)

static const char *
vu_request_to_string(unsigned int req)
{
#define REQ(req) [req] = #req
    static const char *vu_request_str[] = {
        REQ(VHOST_USER_NONE),
        REQ(VHOST_USER_GET_FEATURES),
        REQ(VHOST_USER_SET_FEATURES),
        REQ(VHOST_USER_SET_OWNER),
        REQ(VHOST_USER_RESET_OWNER),
        REQ(VHOST_USER_SET_MEM_TABLE),
        REQ(VHOST_USER_SET_LOG_BASE),
        REQ(VHOST_USER_SET_LOG_FD),
        REQ(VHOST_USER_SET_VRING_NUM),
        REQ(VHOST_USER_SET_VRING_ADDR),
        REQ(VHOST_USER_SET_VRING_BASE),
        REQ(VHOST_USER_GET_VRING_BASE),
        REQ(VHOST_USER_SET_VRING_KICK),
        REQ(VHOST_USER_SET_VRING_CALL),
        REQ(VHOST_USER_SET_VRING_ERR),
        REQ(VHOST_USER_GET_PROTOCOL_FEATURES),
        REQ(VHOST_USER_SET_PROTOCOL_FEATURES),
        REQ(VHOST_USER_GET_QUEUE_NUM),
        REQ(VHOST_USER_SET_VRING_ENABLE),
        REQ(VHOST_USER_SEND_RARP),
        REQ(VHOST_USER_NET_SET_MTU),
        REQ(VHOST_USER_SET_SLAVE_REQ_FD),
        REQ(VHOST_USER_IOTLB_MSG),
        REQ(VHOST_USER_SET_VRING_ENDIAN),
        REQ(VHOST_USER_GET_CONFIG),
        REQ(VHOST_USER_SET_CONFIG),
        REQ(VHOST_USER_POSTCOPY_ADVISE),
        REQ(VHOST_USER_POSTCOPY_LISTEN),
        REQ(VHOST_USER_POSTCOPY_END),
        REQ(VHOST_USER_MAX),
    };
#undef REQ

    if (req < VHOST_USER_MAX) {
        return vu_request_str[req];
    } else {
        return "unknown";
    }
}

static void
vu_panic(VuDev *dev, const char *msg, ...)
{
    char *buf = NULL;
    va_list ap;

    va_start(ap, msg);
    if (vasprintf(&buf, msg, ap) < 0) {
        buf = NULL;
    }
    va_end(ap);

    dev->broken = true;
    dev->panic(dev, buf);
    free(buf);

    /* FIXME: find a way to call virtio_error? */
}

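/*
 * Note that *plen is an in/out argument: on return it is clamped to the
 * number of bytes that are contiguous within the matching region, so a
 * caller that may cross a region boundary has to loop (see
 * virtqueue_map_desc below).
 */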
/* Translate guest physical address to our virtual address.  */
void *
vu_gpa_to_va(VuDev *dev, uint64_t *plen, uint64_t guest_addr)
{
    int i;

    if (*plen == 0) {
        return NULL;
    }

    /* Find matching memory region.  */
    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *r = &dev->regions[i];

        if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) {
            if ((guest_addr + *plen) > (r->gpa + r->size)) {
                *plen = r->gpa + r->size - guest_addr;
            }
            return (void *)(uintptr_t)
                guest_addr - r->gpa + r->mmap_addr + r->mmap_offset;
        }
    }

    return NULL;
}

/* Translate qemu virtual address to our virtual address.  */
static void *
qva_to_va(VuDev *dev, uint64_t qemu_addr)
{
    int i;

    /* Find matching memory region.  */
    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *r = &dev->regions[i];

        if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) {
            return (void *)(uintptr_t)
                qemu_addr - r->qva + r->mmap_addr + r->mmap_offset;
        }
    }

    return NULL;
}

static void
vmsg_close_fds(VhostUserMsg *vmsg)
{
    int i;

    for (i = 0; i < vmsg->fd_num; i++) {
        close(vmsg->fds[i]);
    }
}

/* A test to see if we have userfault available */
static bool
have_userfault(void)
{
#if defined(__linux__) && defined(__NR_userfaultfd) &&\
        defined(UFFD_FEATURE_MISSING_SHMEM) &&\
        defined(UFFD_FEATURE_MISSING_HUGETLBFS)
    /* Now test the kernel we're running on really has the features */
    int ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    struct uffdio_api api_struct;
    if (ufd < 0) {
        return false;
    }

    api_struct.api = UFFD_API;
    api_struct.features = UFFD_FEATURE_MISSING_SHMEM |
                          UFFD_FEATURE_MISSING_HUGETLBFS;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        close(ufd);
        return false;
    }
    close(ufd);
    return true;

#else
    return false;
#endif
}

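/*
 * Read one message from the vhost-user socket: first the fixed-size
 * header, together with any file descriptors passed as SCM_RIGHTS
 * ancillary data, then, if the header announces a payload, the payload
 * itself.
 */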
static bool
vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
{
    char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
    struct iovec iov = {
        .iov_base = (char *)vmsg,
        .iov_len = VHOST_USER_HDR_SIZE,
    };
    struct msghdr msg = {
        .msg_iov = &iov,
        .msg_iovlen = 1,
        .msg_control = control,
        .msg_controllen = sizeof(control),
    };
    size_t fd_size;
    struct cmsghdr *cmsg;
    int rc;

    do {
        rc = recvmsg(conn_fd, &msg, 0);
    } while (rc < 0 && (errno == EINTR || errno == EAGAIN));

    if (rc < 0) {
        vu_panic(dev, "Error while recvmsg: %s", strerror(errno));
        return false;
    }

    vmsg->fd_num = 0;
    for (cmsg = CMSG_FIRSTHDR(&msg);
         cmsg != NULL;
         cmsg = CMSG_NXTHDR(&msg, cmsg))
    {
        if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
            fd_size = cmsg->cmsg_len - CMSG_LEN(0);
            vmsg->fd_num = fd_size / sizeof(int);
            memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
            break;
        }
    }

    if (vmsg->size > sizeof(vmsg->payload)) {
        vu_panic(dev,
                 "Error: too big message request: %d, size: vmsg->size: %u, "
                 "while sizeof(vmsg->payload) = %zu\n",
                 vmsg->request, vmsg->size, sizeof(vmsg->payload));
        goto fail;
    }

    if (vmsg->size) {
        do {
            rc = read(conn_fd, &vmsg->payload, vmsg->size);
        } while (rc < 0 && (errno == EINTR || errno == EAGAIN));

        if (rc <= 0) {
            vu_panic(dev, "Error while reading: %s", strerror(errno));
            goto fail;
        }

        assert(rc == vmsg->size);
    }

    return true;

fail:
    vmsg_close_fds(vmsg);

    return false;
}

static bool
vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
{
    int rc;
    uint8_t *p = (uint8_t *)vmsg;
    char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
    struct iovec iov = {
        .iov_base = (char *)vmsg,
        .iov_len = VHOST_USER_HDR_SIZE,
    };
    struct msghdr msg = {
        .msg_iov = &iov,
        .msg_iovlen = 1,
        .msg_control = control,
    };
    struct cmsghdr *cmsg;

    memset(control, 0, sizeof(control));
    assert(vmsg->fd_num <= VHOST_MEMORY_MAX_NREGIONS);
    if (vmsg->fd_num > 0) {
        size_t fdsize = vmsg->fd_num * sizeof(int);
        msg.msg_controllen = CMSG_SPACE(fdsize);
        cmsg = CMSG_FIRSTHDR(&msg);
        cmsg->cmsg_len = CMSG_LEN(fdsize);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize);
    } else {
        msg.msg_controllen = 0;
    }

    do {
        rc = sendmsg(conn_fd, &msg, 0);
    } while (rc < 0 && (errno == EINTR || errno == EAGAIN));

    if (vmsg->size) {
        do {
            if (vmsg->data) {
                rc = write(conn_fd, vmsg->data, vmsg->size);
            } else {
                rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->size);
            }
        } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
    }

    if (rc <= 0) {
        vu_panic(dev, "Error while writing: %s", strerror(errno));
        return false;
    }

    return true;
}

static bool
vu_send_reply(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
{
    /* Set the version in the flags when sending the reply */
    vmsg->flags &= ~VHOST_USER_VERSION_MASK;
    vmsg->flags |= VHOST_USER_VERSION;
    vmsg->flags |= VHOST_USER_REPLY_MASK;

    return vu_message_write(dev, conn_fd, vmsg);
}

static bool
vu_process_message_reply(VuDev *dev, const VhostUserMsg *vmsg)
{
    VhostUserMsg msg_reply;

    if ((vmsg->flags & VHOST_USER_NEED_REPLY_MASK) == 0) {
        return true;
    }

    if (!vu_message_read(dev, dev->slave_fd, &msg_reply)) {
        return false;
    }

    if (msg_reply.request != vmsg->request) {
        DPRINT("Received unexpected msg type. Expected %d received %d",
               vmsg->request, msg_reply.request);
        return false;
    }

    return msg_reply.payload.u64 == 0;
}

/* Kick the log_call_fd if required. */
static void
vu_log_kick(VuDev *dev)
{
    if (dev->log_call_fd != -1) {
        DPRINT("Kicking the QEMU's log...\n");
        if (eventfd_write(dev->log_call_fd, 1) < 0) {
            vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
        }
    }
}

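/*
 * Dirty log layout: one bit per VHOST_LOG_PAGE-sized page of guest
 * memory, set atomically because QEMU reads the table concurrently.
 */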
static void
vu_log_page(uint8_t *log_table, uint64_t page)
{
    DPRINT("Logged dirty guest page: %"PRId64"\n", page);
    atomic_or(&log_table[page / 8], 1 << (page % 8));
}

static void
vu_log_write(VuDev *dev, uint64_t address, uint64_t length)
{
    uint64_t page;

    if (!(dev->features & (1ULL << VHOST_F_LOG_ALL)) ||
        !dev->log_table || !length) {
        return;
    }

    assert(dev->log_size > ((address + length - 1) / VHOST_LOG_PAGE / 8));

    /* page is an index into the bitmap, so step one page at a time */
    page = address / VHOST_LOG_PAGE;
    while (page * VHOST_LOG_PAGE < address + length) {
        vu_log_page(dev->log_table, page);
        page += 1;
    }

    vu_log_kick(dev);
}

static void
vu_kick_cb(VuDev *dev, int condition, void *data)
{
    int index = (intptr_t)data;
    VuVirtq *vq = &dev->vq[index];
    int sock = vq->kick_fd;
    eventfd_t kick_data;
    ssize_t rc;

    rc = eventfd_read(sock, &kick_data);
    if (rc == -1) {
        vu_panic(dev, "kick eventfd_read(): %s", strerror(errno));
        dev->remove_watch(dev, dev->vq[index].kick_fd);
    } else {
        DPRINT("Got kick_data: %016"PRIx64" handler:%p idx:%d\n",
               kick_data, vq->handler, index);
        if (vq->handler) {
            vq->handler(dev, index);
        }
    }
}

static bool
vu_get_features_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    vmsg->payload.u64 =
        1ULL << VHOST_F_LOG_ALL |
        1ULL << VHOST_USER_F_PROTOCOL_FEATURES;

    if (dev->iface->get_features) {
        vmsg->payload.u64 |= dev->iface->get_features(dev);
    }

    vmsg->size = sizeof(vmsg->payload.u64);
    vmsg->fd_num = 0;

    DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64);

    return true;
}

static void
vu_set_enable_all_rings(VuDev *dev, bool enabled)
{
    int i;

    for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) {
        dev->vq[i].enable = enabled;
    }
}

static bool
vu_set_features_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);

    dev->features = vmsg->payload.u64;

    /* VHOST_USER_F_PROTOCOL_FEATURES is a bit number, not a mask */
    if (!(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) {
        vu_set_enable_all_rings(dev, true);
    }

    if (dev->iface->set_features) {
        dev->iface->set_features(dev, dev->features);
    }

    return false;
}

static bool
vu_set_owner_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    return false;
}

static void
vu_close_log(VuDev *dev)
{
    if (dev->log_table) {
        if (munmap(dev->log_table, dev->log_size) != 0) {
            perror("close log munmap() error");
        }

        dev->log_table = NULL;
    }
    if (dev->log_call_fd != -1) {
        close(dev->log_call_fd);
        dev->log_call_fd = -1;
    }
}

static bool
vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    vu_set_enable_all_rings(dev, false);

    return false;
}

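/*
 * Postcopy SET_MEM_TABLE handling proceeds in three steps: map each
 * region PROT_NONE so nothing can touch it yet, reply to QEMU with the
 * mmap'd addresses and wait for its ack, then register every region
 * with userfaultfd and mprotect it read/write so faults start flowing.
 */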
static bool
vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
{
    int i;
    VhostUserMemory *memory = &vmsg->payload.memory;
    dev->nregions = memory->nregions;

    DPRINT("Nregions: %d\n", memory->nregions);
    for (i = 0; i < dev->nregions; i++) {
        void *mmap_addr;
        VhostUserMemoryRegion *msg_region = &memory->regions[i];
        VuDevRegion *dev_region = &dev->regions[i];

        DPRINT("Region %d\n", i);
        DPRINT("    guest_phys_addr: 0x%016"PRIx64"\n",
               msg_region->guest_phys_addr);
        DPRINT("    memory_size:     0x%016"PRIx64"\n",
               msg_region->memory_size);
        DPRINT("    userspace_addr   0x%016"PRIx64"\n",
               msg_region->userspace_addr);
        DPRINT("    mmap_offset      0x%016"PRIx64"\n",
               msg_region->mmap_offset);

        dev_region->gpa = msg_region->guest_phys_addr;
        dev_region->size = msg_region->memory_size;
        dev_region->qva = msg_region->userspace_addr;
        dev_region->mmap_offset = msg_region->mmap_offset;

        /* We don't use offset argument of mmap() since the
         * mapped address has to be page aligned, and we use huge
         * pages.
         * In postcopy we're using PROT_NONE here to catch anyone
         * accessing it before we userfault.
         */
        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
                         PROT_NONE, MAP_SHARED,
                         vmsg->fds[i], 0);

        if (mmap_addr == MAP_FAILED) {
            vu_panic(dev, "region mmap error: %s", strerror(errno));
        } else {
            dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
            DPRINT("    mmap_addr:       0x%016"PRIx64"\n",
                   dev_region->mmap_addr);
        }

        /* Return the address to QEMU so that it can translate the ufd
         * fault addresses back.
         */
        msg_region->userspace_addr = (uintptr_t)(mmap_addr +
                                                 dev_region->mmap_offset);
        close(vmsg->fds[i]);
    }

    /* Send the message back to qemu with the addresses filled in */
    vmsg->fd_num = 0;
    if (!vu_send_reply(dev, dev->sock, vmsg)) {
        vu_panic(dev, "failed to respond to set-mem-table for postcopy");
        return false;
    }

    /* Wait for QEMU to confirm that it's registered the handler for the
     * faults.
     */
    if (!vu_message_read(dev, dev->sock, vmsg) ||
        vmsg->size != sizeof(vmsg->payload.u64) ||
        vmsg->payload.u64 != 0) {
        vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table");
        return false;
    }

    /* OK, now we can go and register the memory and generate faults */
    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *dev_region = &dev->regions[i];
        int ret;
#ifdef UFFDIO_REGISTER
        /* We should already have an open ufd. Mark each memory
         * range as ufd.
         * Discard any mapping we have here; note I can't use MADV_REMOVE
         * or fallocate to make the hole since I don't want to lose
         * data that's already arrived in the shared process.
         * TODO: How to do this for hugepages?
         */
        ret = madvise((void *)dev_region->mmap_addr,
                      dev_region->size + dev_region->mmap_offset,
                      MADV_DONTNEED);
        if (ret) {
            fprintf(stderr,
                    "%s: Failed to madvise(DONTNEED) region %d: %s\n",
                    __func__, i, strerror(errno));
        }
        /* Turn off transparent hugepages so we don't lose wakeups
         * in neighbouring pages.
         * TODO: Turn this back on later.
         */
        ret = madvise((void *)dev_region->mmap_addr,
                      dev_region->size + dev_region->mmap_offset,
                      MADV_NOHUGEPAGE);
        if (ret) {
            /* Note: This can happen legally on kernels that are configured
             * without madvise'able hugepages
             */
            fprintf(stderr,
                    "%s: Failed to madvise(NOHUGEPAGE) region %d: %s\n",
                    __func__, i, strerror(errno));
        }
        struct uffdio_register reg_struct;
        reg_struct.range.start = (uintptr_t)dev_region->mmap_addr;
        reg_struct.range.len = dev_region->size + dev_region->mmap_offset;
        reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

        if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, &reg_struct)) {
            vu_panic(dev, "%s: Failed to userfault region %d "
                          "@%p + size:%zx offset: %zx: (ufd=%d)%s\n",
                     __func__, i,
                     dev_region->mmap_addr,
                     dev_region->size, dev_region->mmap_offset,
                     dev->postcopy_ufd, strerror(errno));
            return false;
        }
        if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
            vu_panic(dev, "%s Region (%d) doesn't support COPY",
                     __func__, i);
            return false;
        }
        DPRINT("%s: region %d: Registered userfault for %llx + %llx\n",
               __func__, i, reg_struct.range.start, reg_struct.range.len);
        /* Now it's registered we can let the client at it */
        if (mprotect((void *)dev_region->mmap_addr,
                     dev_region->size + dev_region->mmap_offset,
                     PROT_READ | PROT_WRITE)) {
            vu_panic(dev, "failed to mprotect region %d for postcopy (%s)",
                     i, strerror(errno));
            return false;
        }
        /* TODO: Stash 'zero' support flags somewhere */
#endif
    }

    return false;
}

static bool
vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    int i;
    VhostUserMemory *memory = &vmsg->payload.memory;

    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *r = &dev->regions[i];
        void *m = (void *) (uintptr_t) r->mmap_addr;

        if (m) {
            munmap(m, r->size + r->mmap_offset);
        }
    }
    dev->nregions = memory->nregions;

    if (dev->postcopy_listening) {
        return vu_set_mem_table_exec_postcopy(dev, vmsg);
    }

    DPRINT("Nregions: %d\n", memory->nregions);
    for (i = 0; i < dev->nregions; i++) {
        void *mmap_addr;
        VhostUserMemoryRegion *msg_region = &memory->regions[i];
        VuDevRegion *dev_region = &dev->regions[i];

        DPRINT("Region %d\n", i);
        DPRINT("    guest_phys_addr: 0x%016"PRIx64"\n",
               msg_region->guest_phys_addr);
        DPRINT("    memory_size:     0x%016"PRIx64"\n",
               msg_region->memory_size);
        DPRINT("    userspace_addr   0x%016"PRIx64"\n",
               msg_region->userspace_addr);
        DPRINT("    mmap_offset      0x%016"PRIx64"\n",
               msg_region->mmap_offset);

        dev_region->gpa = msg_region->guest_phys_addr;
        dev_region->size = msg_region->memory_size;
        dev_region->qva = msg_region->userspace_addr;
        dev_region->mmap_offset = msg_region->mmap_offset;

        /* We don't use offset argument of mmap() since the
         * mapped address has to be page aligned, and we use huge
         * pages.  */
        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
                         PROT_READ | PROT_WRITE, MAP_SHARED,
                         vmsg->fds[i], 0);

        if (mmap_addr == MAP_FAILED) {
            vu_panic(dev, "region mmap error: %s", strerror(errno));
        } else {
            dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
            DPRINT("    mmap_addr:       0x%016"PRIx64"\n",
                   dev_region->mmap_addr);
        }

        close(vmsg->fds[i]);
    }

    return false;
}

static bool
vu_set_log_base_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    int fd;
    uint64_t log_mmap_size, log_mmap_offset;
    void *rc;

    if (vmsg->fd_num != 1 ||
        vmsg->size != sizeof(vmsg->payload.log)) {
        vu_panic(dev, "Invalid log_base message");
        return true;
    }

    fd = vmsg->fds[0];
    log_mmap_offset = vmsg->payload.log.mmap_offset;
    log_mmap_size = vmsg->payload.log.mmap_size;
    DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset);
    DPRINT("Log mmap_size:   %"PRId64"\n", log_mmap_size);

    rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
              log_mmap_offset);
    close(fd);
    if (rc == MAP_FAILED) {
        perror("log mmap error");
    }

    if (dev->log_table) {
        munmap(dev->log_table, dev->log_size);
    }
    dev->log_table = rc;
    dev->log_size = log_mmap_size;

    vmsg->size = sizeof(vmsg->payload.u64);
    vmsg->fd_num = 0;

    return true;
}

static bool
vu_set_log_fd_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    if (vmsg->fd_num != 1) {
        vu_panic(dev, "Invalid log_fd message");
        return false;
    }

    if (dev->log_call_fd != -1) {
        close(dev->log_call_fd);
    }
    dev->log_call_fd = vmsg->fds[0];
    DPRINT("Got log_call_fd: %d\n", vmsg->fds[0]);

    return false;
}

static bool
vu_set_vring_num_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    unsigned int index = vmsg->payload.state.index;
    unsigned int num = vmsg->payload.state.num;

    DPRINT("State.index: %d\n", index);
    DPRINT("State.num:   %d\n", num);
    dev->vq[index].vring.num = num;

    return false;
}

static bool
vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    struct vhost_vring_addr *vra = &vmsg->payload.addr;
    unsigned int index = vra->index;
    VuVirtq *vq = &dev->vq[index];

    DPRINT("vhost_vring_addr:\n");
    DPRINT("    index:  %d\n", vra->index);
    DPRINT("    flags:  %d\n", vra->flags);
    DPRINT("    desc_user_addr:   0x%016" PRIx64 "\n", vra->desc_user_addr);
    DPRINT("    used_user_addr:   0x%016" PRIx64 "\n", vra->used_user_addr);
    DPRINT("    avail_user_addr:  0x%016" PRIx64 "\n", vra->avail_user_addr);
    DPRINT("    log_guest_addr:   0x%016" PRIx64 "\n", vra->log_guest_addr);

    vq->vring.flags = vra->flags;
    vq->vring.desc = qva_to_va(dev, vra->desc_user_addr);
    vq->vring.used = qva_to_va(dev, vra->used_user_addr);
    vq->vring.avail = qva_to_va(dev, vra->avail_user_addr);
    vq->vring.log_guest_addr = vra->log_guest_addr;

    DPRINT("Setting virtq addresses:\n");
    DPRINT("    vring_desc  at %p\n", vq->vring.desc);
    DPRINT("    vring_used  at %p\n", vq->vring.used);
    DPRINT("    vring_avail at %p\n", vq->vring.avail);

    if (!(vq->vring.desc && vq->vring.used && vq->vring.avail)) {
        vu_panic(dev, "Invalid vring_addr message");
        return false;
    }

    vq->used_idx = vq->vring.used->idx;

    if (vq->last_avail_idx != vq->used_idx) {
        bool resume = dev->iface->queue_is_processed_in_order &&
            dev->iface->queue_is_processed_in_order(dev, index);

        DPRINT("Last avail index != used index: %u != %u%s\n",
               vq->last_avail_idx, vq->used_idx,
               resume ? ", resuming" : "");

        if (resume) {
            vq->shadow_avail_idx = vq->last_avail_idx = vq->used_idx;
        }
    }

    return false;
}

static bool
vu_set_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    unsigned int index = vmsg->payload.state.index;
    unsigned int num = vmsg->payload.state.num;

    DPRINT("State.index: %d\n", index);
    DPRINT("State.num:   %d\n", num);
    dev->vq[index].shadow_avail_idx = dev->vq[index].last_avail_idx = num;

    return false;
}

static bool
vu_get_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    unsigned int index = vmsg->payload.state.index;

    DPRINT("State.index: %d\n", index);
    vmsg->payload.state.num = dev->vq[index].last_avail_idx;
    vmsg->size = sizeof(vmsg->payload.state);

    dev->vq[index].started = false;
    if (dev->iface->queue_set_started) {
        dev->iface->queue_set_started(dev, index, false);
    }

    if (dev->vq[index].call_fd != -1) {
        close(dev->vq[index].call_fd);
        dev->vq[index].call_fd = -1;
    }
    if (dev->vq[index].kick_fd != -1) {
        dev->remove_watch(dev, dev->vq[index].kick_fd);
        close(dev->vq[index].kick_fd);
        dev->vq[index].kick_fd = -1;
    }

    return true;
}

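/*
 * Common validation for SET_VRING_{KICK,CALL,ERR}: check the queue
 * index and make sure exactly one fd was passed unless the NOFD flag
 * is set, closing any received fds on failure.
 */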
static bool
vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
{
    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;

    if (index >= VHOST_MAX_NR_VIRTQUEUE) {
        vmsg_close_fds(vmsg);
        vu_panic(dev, "Invalid queue index: %u", index);
        return false;
    }

    if (vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK ||
        vmsg->fd_num != 1) {
        vmsg_close_fds(vmsg);
        vu_panic(dev, "Invalid fds in request: %d", vmsg->request);
        return false;
    }

    return true;
}

static bool
vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;

    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);

    if (!vu_check_queue_msg_file(dev, vmsg)) {
        return false;
    }

    if (dev->vq[index].kick_fd != -1) {
        dev->remove_watch(dev, dev->vq[index].kick_fd);
        close(dev->vq[index].kick_fd);
        dev->vq[index].kick_fd = -1;
    }

    if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
        dev->vq[index].kick_fd = vmsg->fds[0];
        DPRINT("Got kick_fd: %d for vq: %d\n", vmsg->fds[0], index);
    }

    dev->vq[index].started = true;
    if (dev->iface->queue_set_started) {
        dev->iface->queue_set_started(dev, index, true);
    }

    if (dev->vq[index].kick_fd != -1 && dev->vq[index].handler) {
        dev->set_watch(dev, dev->vq[index].kick_fd, VU_WATCH_IN,
                       vu_kick_cb, (void *)(long)index);

        DPRINT("Waiting for kicks on fd: %d for vq: %d\n",
               dev->vq[index].kick_fd, index);
    }

    return false;
}

void vu_set_queue_handler(VuDev *dev, VuVirtq *vq,
                          vu_queue_handler_cb handler)
{
    int qidx = vq - dev->vq;

    vq->handler = handler;
    if (vq->kick_fd >= 0) {
        if (handler) {
            dev->set_watch(dev, vq->kick_fd, VU_WATCH_IN,
                           vu_kick_cb, (void *)(long)qidx);
        } else {
            dev->remove_watch(dev, vq->kick_fd);
        }
    }
}

bool vu_set_queue_host_notifier(VuDev *dev, VuVirtq *vq, int fd,
                                int size, int offset)
{
    int qidx = vq - dev->vq;
    int fd_num = 0;
    VhostUserMsg vmsg = {
        .request = VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG,
        .flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK,
        .size = sizeof(vmsg.payload.area),
        .payload.area = {
            .u64 = qidx & VHOST_USER_VRING_IDX_MASK,
            .size = size,
            .offset = offset,
        },
    };

    if (fd == -1) {
        vmsg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK;
    } else {
        vmsg.fds[fd_num++] = fd;
    }

    vmsg.fd_num = fd_num;

    /* VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD is a bit number, not a mask */
    if (!(dev->protocol_features &
          (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD))) {
        return false;
    }

    if (!vu_message_write(dev, dev->slave_fd, &vmsg)) {
        return false;
    }

    return vu_process_message_reply(dev, &vmsg);
}

static bool
vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;

    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);

    if (!vu_check_queue_msg_file(dev, vmsg)) {
        return false;
    }

    if (dev->vq[index].call_fd != -1) {
        close(dev->vq[index].call_fd);
        dev->vq[index].call_fd = -1;
    }

    if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
        dev->vq[index].call_fd = vmsg->fds[0];
    }

    DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index);

    return false;
}

static bool
vu_set_vring_err_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;

    DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);

    if (!vu_check_queue_msg_file(dev, vmsg)) {
        return false;
    }

    if (dev->vq[index].err_fd != -1) {
        close(dev->vq[index].err_fd);
        dev->vq[index].err_fd = -1;
    }

    if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
        dev->vq[index].err_fd = vmsg->fds[0];
    }

    return false;
}

static bool
vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD |
                        1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ |
                        1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER |
                        1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD;

    if (have_userfault()) {
        features |= 1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT;
    }

    if (dev->iface->get_protocol_features) {
        features |= dev->iface->get_protocol_features(dev);
    }

    vmsg->payload.u64 = features;
    vmsg->size = sizeof(vmsg->payload.u64);
    vmsg->fd_num = 0;

    return true;
}

static bool
vu_set_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    uint64_t features = vmsg->payload.u64;

    DPRINT("u64: 0x%016"PRIx64"\n", features);

    dev->protocol_features = vmsg->payload.u64;

    if (dev->iface->set_protocol_features) {
        dev->iface->set_protocol_features(dev, features);
    }

    return false;
}

static bool
vu_get_queue_num_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    DPRINT("Function %s() not implemented yet.\n", __func__);
    return false;
}

static bool
vu_set_vring_enable_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    unsigned int index = vmsg->payload.state.index;
    unsigned int enable = vmsg->payload.state.num;

    DPRINT("State.index:  %d\n", index);
    DPRINT("State.enable: %d\n", enable);

    if (index >= VHOST_MAX_NR_VIRTQUEUE) {
        vu_panic(dev, "Invalid vring_enable index: %u", index);
        return false;
    }

    dev->vq[index].enable = enable;
    return false;
}

static bool
vu_set_slave_req_fd(VuDev *dev, VhostUserMsg *vmsg)
{
    if (vmsg->fd_num != 1) {
        vu_panic(dev, "Invalid slave_req_fd message (%d fd's)", vmsg->fd_num);
        return false;
    }

    if (dev->slave_fd != -1) {
        close(dev->slave_fd);
    }
    dev->slave_fd = vmsg->fds[0];
    DPRINT("Got slave_fd: %d\n", vmsg->fds[0]);

    return false;
}

static bool
vu_get_config(VuDev *dev, VhostUserMsg *vmsg)
{
    int ret = -1;

    if (dev->iface->get_config) {
        ret = dev->iface->get_config(dev, vmsg->payload.config.region,
                                     vmsg->payload.config.size);
    }

    if (ret) {
        /* resize to zero to indicate an error to master */
        vmsg->size = 0;
    }

    return true;
}

static bool
vu_set_config(VuDev *dev, VhostUserMsg *vmsg)
{
    int ret = -1;

    if (dev->iface->set_config) {
        ret = dev->iface->set_config(dev, vmsg->payload.config.region,
                                     vmsg->payload.config.offset,
                                     vmsg->payload.config.size,
                                     vmsg->payload.config.flags);
        if (ret) {
            vu_panic(dev, "Set virtio configuration space failed");
        }
    }

    return false;
}

static bool
vu_set_postcopy_advise(VuDev *dev, VhostUserMsg *vmsg)
{
    dev->postcopy_ufd = -1;
#ifdef UFFDIO_API
    struct uffdio_api api_struct;

    dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    vmsg->size = 0;
#endif

    if (dev->postcopy_ufd == -1) {
        vu_panic(dev, "Userfaultfd not available: %s", strerror(errno));
        goto out;
    }

#ifdef UFFDIO_API
    api_struct.api = UFFD_API;
    api_struct.features = 0;
    if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) {
        vu_panic(dev, "Failed UFFDIO_API: %s", strerror(errno));
        close(dev->postcopy_ufd);
        dev->postcopy_ufd = -1;
        goto out;
    }
    /* TODO: Stash feature flags somewhere */
#endif

out:
    /* Return a ufd to the QEMU */
    vmsg->fd_num = 1;
    vmsg->fds[0] = dev->postcopy_ufd;
    return true; /* = send a reply */
}

static bool
vu_set_postcopy_listen(VuDev *dev, VhostUserMsg *vmsg)
{
    vmsg->payload.u64 = -1;
    vmsg->size = sizeof(vmsg->payload.u64);

    if (dev->nregions) {
        vu_panic(dev, "Regions already registered at postcopy-listen");
        return true;
    }
    dev->postcopy_listening = true;

    vmsg->flags = VHOST_USER_VERSION | VHOST_USER_REPLY_MASK;
    vmsg->payload.u64 = 0; /* Success */
    return true;
}

static bool
vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg)
{
    DPRINT("%s: Entry\n", __func__);
    dev->postcopy_listening = false;
    if (dev->postcopy_ufd > 0) {
        close(dev->postcopy_ufd);
        dev->postcopy_ufd = -1;
        DPRINT("%s: Done close\n", __func__);
    }

    vmsg->fd_num = 0;
    vmsg->payload.u64 = 0;
    vmsg->size = sizeof(vmsg->payload.u64);
    vmsg->flags = VHOST_USER_VERSION | VHOST_USER_REPLY_MASK;
    DPRINT("%s: exit\n", __func__);
    return true;
}

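/*
 * The handlers above return true when vu_process_message() should send
 * the (possibly modified) vmsg back as a reply, and false when no reply
 * is needed; a device can intercept any message via iface->process_msg.
 */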
static bool
vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
{
    int do_reply = 0;

    /* Print out generic part of the request. */
    DPRINT("================ Vhost user message ================\n");
    DPRINT("Request: %s (%d)\n", vu_request_to_string(vmsg->request),
           vmsg->request);
    DPRINT("Flags:   0x%x\n", vmsg->flags);
    DPRINT("Size:    %d\n", vmsg->size);

    if (vmsg->fd_num) {
        int i;
        DPRINT("Fds:");
        for (i = 0; i < vmsg->fd_num; i++) {
            DPRINT(" %d", vmsg->fds[i]);
        }
        DPRINT("\n");
    }

    if (dev->iface->process_msg &&
        dev->iface->process_msg(dev, vmsg, &do_reply)) {
        return do_reply;
    }

    switch (vmsg->request) {
    case VHOST_USER_GET_FEATURES:
        return vu_get_features_exec(dev, vmsg);
    case VHOST_USER_SET_FEATURES:
        return vu_set_features_exec(dev, vmsg);
    case VHOST_USER_GET_PROTOCOL_FEATURES:
        return vu_get_protocol_features_exec(dev, vmsg);
    case VHOST_USER_SET_PROTOCOL_FEATURES:
        return vu_set_protocol_features_exec(dev, vmsg);
    case VHOST_USER_SET_OWNER:
        return vu_set_owner_exec(dev, vmsg);
    case VHOST_USER_RESET_OWNER:
        return vu_reset_device_exec(dev, vmsg);
    case VHOST_USER_SET_MEM_TABLE:
        return vu_set_mem_table_exec(dev, vmsg);
    case VHOST_USER_SET_LOG_BASE:
        return vu_set_log_base_exec(dev, vmsg);
    case VHOST_USER_SET_LOG_FD:
        return vu_set_log_fd_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_NUM:
        return vu_set_vring_num_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_ADDR:
        return vu_set_vring_addr_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_BASE:
        return vu_set_vring_base_exec(dev, vmsg);
    case VHOST_USER_GET_VRING_BASE:
        return vu_get_vring_base_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_KICK:
        return vu_set_vring_kick_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_CALL:
        return vu_set_vring_call_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_ERR:
        return vu_set_vring_err_exec(dev, vmsg);
    case VHOST_USER_GET_QUEUE_NUM:
        return vu_get_queue_num_exec(dev, vmsg);
    case VHOST_USER_SET_VRING_ENABLE:
        return vu_set_vring_enable_exec(dev, vmsg);
    case VHOST_USER_SET_SLAVE_REQ_FD:
        return vu_set_slave_req_fd(dev, vmsg);
    case VHOST_USER_GET_CONFIG:
        return vu_get_config(dev, vmsg);
    case VHOST_USER_SET_CONFIG:
        return vu_set_config(dev, vmsg);
    case VHOST_USER_NONE:
        /* if you need processing before exit, override iface->process_msg */
        exit(0);
    case VHOST_USER_POSTCOPY_ADVISE:
        return vu_set_postcopy_advise(dev, vmsg);
    case VHOST_USER_POSTCOPY_LISTEN:
        return vu_set_postcopy_listen(dev, vmsg);
    case VHOST_USER_POSTCOPY_END:
        return vu_set_postcopy_end(dev, vmsg);
    default:
        vmsg_close_fds(vmsg);
        vu_panic(dev, "Unhandled request: %d", vmsg->request);
    }

    return false;
}

bool
vu_dispatch(VuDev *dev)
{
    VhostUserMsg vmsg = { 0, };
    int reply_requested;
    bool success = false;

    if (!vu_message_read(dev, dev->sock, &vmsg)) {
        goto end;
    }

    reply_requested = vu_process_message(dev, &vmsg);
    if (!reply_requested) {
        success = true;
        goto end;
    }

    if (!vu_send_reply(dev, dev->sock, &vmsg)) {
        goto end;
    }

    success = true;

end:
    free(vmsg.data);
    return success;
}

void
vu_deinit(VuDev *dev)
{
    int i;

    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *r = &dev->regions[i];
        void *m = (void *) (uintptr_t) r->mmap_addr;
        if (m != MAP_FAILED) {
            munmap(m, r->size + r->mmap_offset);
        }
    }
    dev->nregions = 0;

    for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) {
        VuVirtq *vq = &dev->vq[i];

        if (vq->call_fd != -1) {
            close(vq->call_fd);
            vq->call_fd = -1;
        }

        if (vq->kick_fd != -1) {
            close(vq->kick_fd);
            vq->kick_fd = -1;
        }

        if (vq->err_fd != -1) {
            close(vq->err_fd);
            vq->err_fd = -1;
        }
    }

    vu_close_log(dev);
    if (dev->slave_fd != -1) {
        close(dev->slave_fd);
        dev->slave_fd = -1;
    }

    if (dev->sock != -1) {
        close(dev->sock);
    }
}

void
vu_init(VuDev *dev,
        int socket,
        vu_panic_cb panic,
        vu_set_watch_cb set_watch,
        vu_remove_watch_cb remove_watch,
        const VuDevIface *iface)
{
    int i;

    assert(socket >= 0);
    assert(set_watch);
    assert(remove_watch);
    assert(iface);
    assert(panic);

    memset(dev, 0, sizeof(*dev));

    dev->sock = socket;
    dev->panic = panic;
    dev->set_watch = set_watch;
    dev->remove_watch = remove_watch;
    dev->iface = iface;
    dev->log_call_fd = -1;
    dev->slave_fd = -1;
    for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) {
        dev->vq[i] = (VuVirtq) {
            .call_fd = -1, .kick_fd = -1, .err_fd = -1,
            .notification = true,
        };
    }
}
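
/*
 * Rough usage sketch (illustrative only): a backend wires vu_init() into
 * its own event loop and calls vu_dispatch() whenever the socket becomes
 * readable.  panic_cb, my_set_watch, my_remove_watch, my_iface and
 * accepted_fd below are placeholders a real backend has to supply.
 *
 *     static void panic_cb(VuDev *dev, const char *err)
 *     {
 *         fprintf(stderr, "vhost-user panic: %s\n", err);
 *         exit(1);
 *     }
 *
 *     VuDev dev;
 *     vu_init(&dev, accepted_fd, panic_cb,
 *             my_set_watch, my_remove_watch, &my_iface);
 *     while (vu_dispatch(&dev)) {
 *         // one vhost-user message handled per readable event
 *     }
 *     vu_deinit(&dev);
 */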

VuVirtq *
vu_get_queue(VuDev *dev, int qidx)
{
    assert(qidx < VHOST_MAX_NR_VIRTQUEUE);
    return &dev->vq[qidx];
}

bool
vu_queue_enabled(VuDev *dev, VuVirtq *vq)
{
    return vq->enable;
}

bool
vu_queue_started(const VuDev *dev, const VuVirtq *vq)
{
    return vq->started;
}

static inline uint16_t
vring_avail_flags(VuVirtq *vq)
{
    return vq->vring.avail->flags;
}

static inline uint16_t
vring_avail_idx(VuVirtq *vq)
{
    vq->shadow_avail_idx = vq->vring.avail->idx;

    return vq->shadow_avail_idx;
}

static inline uint16_t
vring_avail_ring(VuVirtq *vq, int i)
{
    return vq->vring.avail->ring[i];
}

static inline uint16_t
vring_get_used_event(VuVirtq *vq)
{
    return vring_avail_ring(vq, vq->vring.num);
}

static int
virtqueue_num_heads(VuDev *dev, VuVirtq *vq, unsigned int idx)
{
    uint16_t num_heads = vring_avail_idx(vq) - idx;

    /* Check it isn't doing very strange things with descriptor numbers. */
    if (num_heads > vq->vring.num) {
        vu_panic(dev, "Guest moved used index from %u to %u",
                 idx, vq->shadow_avail_idx);
        return -1;
    }
    if (num_heads) {
        /* On success, callers read a descriptor at vq->last_avail_idx.
         * Make sure descriptor read does not bypass avail index read. */
        smp_rmb();
    }

    return num_heads;
}

static bool
virtqueue_get_head(VuDev *dev, VuVirtq *vq,
                   unsigned int idx, unsigned int *head)
{
    /* Grab the next descriptor number they're advertising, and increment
     * the index we've seen. */
    *head = vring_avail_ring(vq, idx % vq->vring.num);

    /* If their number is silly, that's a fatal mistake. */
    if (*head >= vq->vring.num) {
        vu_panic(dev, "Guest says index %u is available", *head);
        return false;
    }

    return true;
}

static int
virtqueue_read_indirect_desc(VuDev *dev, struct vring_desc *desc,
                             uint64_t addr, size_t len)
{
    struct vring_desc *ori_desc;
    uint64_t read_len;

    if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) {
        return -1;
    }

    if (len == 0) {
        return -1;
    }

    while (len) {
        read_len = len;
        ori_desc = vu_gpa_to_va(dev, &read_len, addr);
        if (!ori_desc) {
            return -1;
        }

        memcpy(desc, ori_desc, read_len);
        len -= read_len;
        addr += read_len;
        /* read_len is in bytes, so advance the destination by bytes too */
        desc = (struct vring_desc *)((char *)desc + read_len);
    }

    return 0;
}

enum {
    VIRTQUEUE_READ_DESC_ERROR = -1,
    VIRTQUEUE_READ_DESC_DONE = 0,   /* end of chain */
    VIRTQUEUE_READ_DESC_MORE = 1,   /* more buffers in chain */
};

static int
virtqueue_read_next_desc(VuDev *dev, struct vring_desc *desc,
                         int i, unsigned int max, unsigned int *next)
{
    /* If this descriptor says it doesn't chain, we're done. */
    if (!(desc[i].flags & VRING_DESC_F_NEXT)) {
        return VIRTQUEUE_READ_DESC_DONE;
    }

    /* Check they're not leading us off end of descriptors. */
    *next = desc[i].next;
    /* Make sure compiler knows to grab that: we don't want it changing! */
    smp_wmb();

    if (*next >= max) {
        vu_panic(dev, "Desc next is %u", *next);
        return VIRTQUEUE_READ_DESC_ERROR;
    }

    return VIRTQUEUE_READ_DESC_MORE;
}

void
vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes,
                         unsigned int *out_bytes,
                         unsigned max_in_bytes, unsigned max_out_bytes)
{
    unsigned int idx;
    unsigned int total_bufs, in_total, out_total;
    int rc;

    idx = vq->last_avail_idx;

    total_bufs = in_total = out_total = 0;
    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        goto done;
    }

    while ((rc = virtqueue_num_heads(dev, vq, idx)) > 0) {
        unsigned int max, desc_len, num_bufs, indirect = 0;
        uint64_t desc_addr, read_len;
        struct vring_desc *desc;
        struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
        unsigned int i;

        max = vq->vring.num;
        num_bufs = total_bufs;
        if (!virtqueue_get_head(dev, vq, idx++, &i)) {
            goto err;
        }
        desc = vq->vring.desc;

        if (desc[i].flags & VRING_DESC_F_INDIRECT) {
            if (desc[i].len % sizeof(struct vring_desc)) {
                vu_panic(dev, "Invalid size for indirect buffer table");
                goto err;
            }

            /* If we've got too many, that implies a descriptor loop. */
            if (num_bufs >= max) {
                vu_panic(dev, "Looped descriptor");
                goto err;
            }

            /* loop over the indirect descriptor table */
            indirect = 1;
            desc_addr = desc[i].addr;
            desc_len = desc[i].len;
            max = desc_len / sizeof(struct vring_desc);
            read_len = desc_len;
            desc = vu_gpa_to_va(dev, &read_len, desc_addr);
            if (unlikely(desc && read_len != desc_len)) {
                /* Failed to use zero copy */
                desc = NULL;
                if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                                  desc_addr,
                                                  desc_len)) {
                    desc = desc_buf;
                }
            }
            if (!desc) {
                vu_panic(dev, "Invalid indirect buffer table");
                goto err;
            }
            num_bufs = i = 0;
        }

        do {
            /* If we've got too many, that implies a descriptor loop. */
            if (++num_bufs > max) {
                vu_panic(dev, "Looped descriptor");
                goto err;
            }

            if (desc[i].flags & VRING_DESC_F_WRITE) {
                in_total += desc[i].len;
            } else {
                out_total += desc[i].len;
            }
            if (in_total >= max_in_bytes && out_total >= max_out_bytes) {
                goto done;
            }
            rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
        } while (rc == VIRTQUEUE_READ_DESC_MORE);

        if (rc == VIRTQUEUE_READ_DESC_ERROR) {
            goto err;
        }

        if (!indirect) {
            total_bufs = num_bufs;
        } else {
            total_bufs++;
        }
    }
    if (rc < 0) {
        goto err;
    }
done:
    if (in_bytes) {
        *in_bytes = in_total;
    }
    if (out_bytes) {
        *out_bytes = out_total;
    }
    return;

err:
    in_total = out_total = 0;
    goto done;
}

bool
vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes,
                     unsigned int out_bytes)
{
    unsigned int in_total, out_total;

    vu_queue_get_avail_bytes(dev, vq, &in_total, &out_total,
                             in_bytes, out_bytes);

    return in_bytes <= in_total && out_bytes <= out_total;
}

/* Fetch avail_idx from VQ memory only when we really need to know if
 * guest has added some buffers. */
bool
vu_queue_empty(VuDev *dev, VuVirtq *vq)
{
    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return true;
    }

    if (vq->shadow_avail_idx != vq->last_avail_idx) {
        return false;
    }

    return vring_avail_idx(vq) == vq->last_avail_idx;
}

static inline
bool has_feature(uint64_t features, unsigned int fbit)
{
    assert(fbit < 64);
    return !!(features & (1ULL << fbit));
}

static inline
bool vu_has_feature(VuDev *dev,
                    unsigned int fbit)
{
    return has_feature(dev->features, fbit);
}

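/*
 * Decide whether the guest needs an interrupt.  Without
 * VIRTIO_RING_F_EVENT_IDX this is just the NO_INTERRUPT avail flag; with
 * it, we notify only when the new used index crosses the used_event
 * index the guest published (vring_need_event), which batches interrupts.
 */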
1705 | static bool | |
1706 | vring_notify(VuDev *dev, VuVirtq *vq) | |
1707 | { | |
1708 | uint16_t old, new; | |
1709 | bool v; | |
1710 | ||
1711 | /* We need to expose used array entries before checking used event. */ | |
1712 | smp_mb(); | |
1713 | ||
1714 | /* Always notify when queue is empty (when feature acknowledge) */ | |
1715 | if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) && | |
1716 | !vq->inuse && vu_queue_empty(dev, vq)) { | |
1717 | return true; | |
1718 | } | |
1719 | ||
1720 | if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { | |
1721 | return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT); | |
1722 | } | |
1723 | ||
1724 | v = vq->signalled_used_valid; | |
1725 | vq->signalled_used_valid = true; | |
1726 | old = vq->signalled_used; | |
1727 | new = vq->signalled_used = vq->used_idx; | |
1728 | return !v || vring_need_event(vring_get_used_event(vq), new, old); | |
1729 | } | |
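
/*
 * vring_need_event() (from the virtio ring headers) implements the
 * VIRTIO_RING_F_EVENT_IDX check in modulo-2^16 arithmetic: notify iff
 * (uint16_t)(new - event_idx - 1) < (uint16_t)(new - old).  For
 * example, with old = 5 and new = 8: used_event = 6 gives 1 < 3, so
 * notify; used_event = 9 gives 65534 < 3 == false, so the interrupt
 * is suppressed until the driver advances used_event.
 */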

void
vu_queue_notify(VuDev *dev, VuVirtq *vq)
{
    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return;
    }

    if (!vring_notify(dev, vq)) {
        DPRINT("skipped notify...\n");
        return;
    }

    if (eventfd_write(vq->call_fd, 1) < 0) {
        vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
    }
}
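
/*
 * Callers can invoke vu_queue_notify() unconditionally after
 * completing requests; vring_notify() above already suppresses the
 * eventfd write when the driver disabled interrupts or the event
 * index was not crossed.  A typical completion is simply:
 *
 *   vu_queue_push(dev, vq, elem, len);
 *   vu_queue_notify(dev, vq);
 */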

static inline void
vring_used_flags_set_bit(VuVirtq *vq, int mask)
{
    uint16_t *flags;

    flags = (uint16_t *)((char *)vq->vring.used +
                         offsetof(struct vring_used, flags));
    *flags |= mask;
}

static inline void
vring_used_flags_unset_bit(VuVirtq *vq, int mask)
{
    uint16_t *flags;

    flags = (uint16_t *)((char *)vq->vring.used +
                         offsetof(struct vring_used, flags));
    *flags &= ~mask;
}

static inline void
vring_set_avail_event(VuVirtq *vq, uint16_t val)
{
    if (!vq->notification) {
        return;
    }

    *((uint16_t *) &vq->vring.used->ring[vq->vring.num]) = val;
}

void
vu_queue_set_notification(VuDev *dev, VuVirtq *vq, int enable)
{
    vq->notification = enable;
    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        vring_set_avail_event(vq, vring_avail_idx(vq));
    } else if (enable) {
        vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY);
    } else {
        vring_used_flags_set_bit(vq, VRING_USED_F_NO_NOTIFY);
    }
    if (enable) {
        /* Expose avail event/used flags before caller checks the avail idx. */
        smp_mb();
    }
}
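
/*
 * Sketch of the usual drain loop (process_all() is hypothetical):
 * suppress guest kicks while there is work, then re-enable and
 * re-check emptiness to close the race with a concurrent submission.
 * The smp_mb() in vu_queue_set_notification() makes the final
 * vu_queue_empty() check safe.
 *
 *   vu_queue_set_notification(dev, vq, 0);
 *   for (;;) {
 *       process_all(dev, vq);
 *       vu_queue_set_notification(dev, vq, 1);
 *       if (vu_queue_empty(dev, vq)) {
 *           break;
 *       }
 *       vu_queue_set_notification(dev, vq, 0);
 *   }
 */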

static void
virtqueue_map_desc(VuDev *dev,
                   unsigned int *p_num_sg, struct iovec *iov,
                   unsigned int max_num_sg, bool is_write,
                   uint64_t pa, size_t sz)
{
    unsigned num_sg = *p_num_sg;

    assert(num_sg <= max_num_sg);

    if (!sz) {
        vu_panic(dev, "virtio: zero sized buffers are not allowed");
        return;
    }

    while (sz) {
        uint64_t len = sz;

        if (num_sg == max_num_sg) {
            vu_panic(dev, "virtio: too many descriptors in indirect table");
            return;
        }

        iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa);
        if (iov[num_sg].iov_base == NULL) {
            vu_panic(dev, "virtio: invalid address for buffers");
            return;
        }
        iov[num_sg].iov_len = len;
        num_sg++;
        sz -= len;
        pa += len;
    }

    *p_num_sg = num_sg;
}
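
/*
 * Note on the loop above: vu_gpa_to_va() shrinks *len when the
 * guest-physical range is not contiguous in the backend's mappings,
 * so a single descriptor may be split across several iovec entries;
 * pa and sz are advanced by whatever each mapping actually covered.
 */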

/* Round number down to multiple */
#define ALIGN_DOWN(n, m) ((n) / (m) * (m))

/* Round number up to multiple */
#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
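
/*
 * For example, ALIGN_DOWN(13, 8) == 8 and ALIGN_UP(13, 8) == 16.
 * Because these use integer division rather than bit masking, m may
 * be any positive multiple, not just a power of two.
 */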

static void *
virtqueue_alloc_element(size_t sz,
                        unsigned out_num, unsigned in_num)
{
    VuVirtqElement *elem;
    size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
    size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
    size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);

    assert(sz >= sizeof(VuVirtqElement));
    elem = malloc(out_sg_end);
    elem->out_num = out_num;
    elem->in_num = in_num;
    elem->in_sg = (void *)elem + in_sg_ofs;
    elem->out_sg = (void *)elem + out_sg_ofs;
    return elem;
}
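
/*
 * Resulting layout (illustrative):
 *
 *   [ caller struct of size sz | pad | in_sg[in_num] | out_sg[out_num] ]
 *
 * The caller's element type must begin with a VuVirtqElement.  Both
 * scatter-gather arrays live in the same allocation, so a single
 * free() of the returned pointer releases everything.
 */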

void *
vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
{
    unsigned int i, head, max, desc_len;
    uint64_t desc_addr, read_len;
    VuVirtqElement *elem;
    unsigned out_num, in_num;
    struct iovec iov[VIRTQUEUE_MAX_SIZE];
    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
    struct vring_desc *desc;
    int rc;

    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return NULL;
    }

    if (vu_queue_empty(dev, vq)) {
        return NULL;
    }
    /* Needed after vu_queue_empty(), see comment in
     * virtqueue_num_heads(). */
    smp_rmb();

    /* When we start there are neither input nor output buffers. */
    out_num = in_num = 0;

    max = vq->vring.num;
    if (vq->inuse >= vq->vring.num) {
        vu_panic(dev, "Virtqueue size exceeded");
        return NULL;
    }

    if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) {
        return NULL;
    }

    if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        vring_set_avail_event(vq, vq->last_avail_idx);
    }

    i = head;
    desc = vq->vring.desc;
    if (desc[i].flags & VRING_DESC_F_INDIRECT) {
        if (desc[i].len % sizeof(struct vring_desc)) {
            vu_panic(dev, "Invalid size for indirect buffer table");
        }

        /* loop over the indirect descriptor table */
        desc_addr = desc[i].addr;
        desc_len = desc[i].len;
        max = desc_len / sizeof(struct vring_desc);
        read_len = desc_len;
        desc = vu_gpa_to_va(dev, &read_len, desc_addr);
        if (unlikely(desc && read_len != desc_len)) {
            /* Failed to use zero copy */
            desc = NULL;
            if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                              desc_addr,
                                              desc_len)) {
                desc = desc_buf;
            }
        }
        if (!desc) {
            vu_panic(dev, "Invalid indirect buffer table");
            return NULL;
        }
        i = 0;
    }

    /* Collect all the descriptors */
    do {
        if (desc[i].flags & VRING_DESC_F_WRITE) {
            virtqueue_map_desc(dev, &in_num, iov + out_num,
                               VIRTQUEUE_MAX_SIZE - out_num, true,
                               desc[i].addr, desc[i].len);
        } else {
            if (in_num) {
                vu_panic(dev, "Incorrect order for descriptors");
                return NULL;
            }
            virtqueue_map_desc(dev, &out_num, iov,
                               VIRTQUEUE_MAX_SIZE, false,
                               desc[i].addr, desc[i].len);
        }

        /* If we've got too many, that implies a descriptor loop. */
        if ((in_num + out_num) > max) {
            vu_panic(dev, "Looped descriptor");
        }
        rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
    } while (rc == VIRTQUEUE_READ_DESC_MORE);

    if (rc == VIRTQUEUE_READ_DESC_ERROR) {
        return NULL;
    }

    /* Now copy what we have collected and mapped */
    elem = virtqueue_alloc_element(sz, out_num, in_num);
    elem->index = head;
    for (i = 0; i < out_num; i++) {
        elem->out_sg[i] = iov[i];
    }
    for (i = 0; i < in_num; i++) {
        elem->in_sg[i] = iov[out_num + i];
    }

    vq->inuse++;

    return elem;
}
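
/*
 * Usage sketch (process() is hypothetical): drain the queue, handing
 * each element's readable (out_sg) and writable (in_sg) vectors to a
 * device-specific handler.  The caller owns the returned element and
 * must free() it once completed.
 *
 *   VuVirtqElement *elem;
 *
 *   while ((elem = vu_queue_pop(dev, vq, sizeof(*elem)))) {
 *       size_t written = process(elem->out_sg, elem->out_num,
 *                                elem->in_sg, elem->in_num);
 *       vu_queue_push(dev, vq, elem, written);
 *       free(elem);
 *   }
 *   vu_queue_notify(dev, vq);
 */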

bool
vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num)
{
    if (num > vq->inuse) {
        return false;
    }
    vq->last_avail_idx -= num;
    vq->inuse -= num;
    return true;
}
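
/*
 * Sketch (can_process() is hypothetical): a backend that cannot
 * handle the most recently popped element right now may return it to
 * the available ring instead of completing it, and retry later:
 *
 *   if (!can_process(elem)) {
 *       vu_queue_rewind(dev, vq, 1);
 *       free(elem);
 *       return;
 *   }
 */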

static inline
void vring_used_write(VuDev *dev, VuVirtq *vq,
                      struct vring_used_elem *uelem, int i)
{
    struct vring_used *used = vq->vring.used;

    used->ring[i] = *uelem;
    vu_log_write(dev, vq->vring.log_guest_addr +
                 offsetof(struct vring_used, ring[i]),
                 sizeof(used->ring[i]));
}


static void
vu_log_queue_fill(VuDev *dev, VuVirtq *vq,
                  const VuVirtqElement *elem,
                  unsigned int len)
{
    struct vring_desc *desc = vq->vring.desc;
    unsigned int i, max, min, desc_len;
    uint64_t desc_addr, read_len;
    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
    unsigned num_bufs = 0;

    max = vq->vring.num;
    i = elem->index;

    if (desc[i].flags & VRING_DESC_F_INDIRECT) {
        if (desc[i].len % sizeof(struct vring_desc)) {
            vu_panic(dev, "Invalid size for indirect buffer table");
        }

        /* loop over the indirect descriptor table */
        desc_addr = desc[i].addr;
        desc_len = desc[i].len;
        max = desc_len / sizeof(struct vring_desc);
        read_len = desc_len;
        desc = vu_gpa_to_va(dev, &read_len, desc_addr);
        if (unlikely(desc && read_len != desc_len)) {
            /* Failed to use zero copy */
            desc = NULL;
            if (!virtqueue_read_indirect_desc(dev, desc_buf,
                                              desc_addr,
                                              desc_len)) {
                desc = desc_buf;
            }
        }
        if (!desc) {
            vu_panic(dev, "Invalid indirect buffer table");
            return;
        }
        i = 0;
    }

    do {
        if (++num_bufs > max) {
            vu_panic(dev, "Looped descriptor");
            return;
        }

        if (desc[i].flags & VRING_DESC_F_WRITE) {
            min = MIN(desc[i].len, len);
            vu_log_write(dev, desc[i].addr, min);
            len -= min;
        }

    } while (len > 0 &&
             (virtqueue_read_next_desc(dev, desc, i, max, &i)
              == VIRTQUEUE_READ_DESC_MORE));
}

void
vu_queue_fill(VuDev *dev, VuVirtq *vq,
              const VuVirtqElement *elem,
              unsigned int len, unsigned int idx)
{
    struct vring_used_elem uelem;

    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return;
    }

    vu_log_queue_fill(dev, vq, elem, len);

    idx = (idx + vq->used_idx) % vq->vring.num;

    uelem.id = elem->index;
    uelem.len = len;
    vring_used_write(dev, vq, &uelem, idx);
}

static inline
void vring_used_idx_set(VuDev *dev, VuVirtq *vq, uint16_t val)
{
    vq->vring.used->idx = val;
    vu_log_write(dev,
                 vq->vring.log_guest_addr + offsetof(struct vring_used, idx),
                 sizeof(vq->vring.used->idx));

    vq->used_idx = val;
}

void
vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count)
{
    uint16_t old, new;

    if (unlikely(dev->broken) ||
        unlikely(!vq->vring.avail)) {
        return;
    }

    /* Make sure buffer is written before we update index. */
    smp_wmb();

    old = vq->used_idx;
    new = old + count;
    vring_used_idx_set(dev, vq, new);
    vq->inuse -= count;
    if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
        vq->signalled_used_valid = false;
    }
}
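
/*
 * fill and flush allow completions to be batched: vu_queue_fill()
 * writes used-ring entries at consecutive offsets from used_idx
 * without publishing them, and a single vu_queue_flush() then makes
 * the whole batch visible to the driver.  Sketch:
 *
 *   for (i = 0; i < n; i++) {
 *       vu_queue_fill(dev, vq, elems[i], lens[i], i);
 *   }
 *   vu_queue_flush(dev, vq, n);
 *   vu_queue_notify(dev, vq);
 */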

void
vu_queue_push(VuDev *dev, VuVirtq *vq,
              const VuVirtqElement *elem, unsigned int len)
{
    vu_queue_fill(dev, vq, elem, len, 0);
    vu_queue_flush(dev, vq, 1);
}
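
/*
 * vu_queue_push() is the common single-element completion path: one
 * fill at offset 0 followed by a flush of one entry.  Signalling the
 * driver remains the caller's responsibility, e.g.:
 *
 *   vu_queue_push(dev, vq, elem, len);
 *   free(elem);
 *   vu_queue_notify(dev, vq);
 */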