/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _RTE_VHOST_H_
#define _RTE_VHOST_H_

/**
 * @file
 * Interface to vhost-user
 */

#include <stdint.h>
#include <linux/vhost.h>
#include <linux/virtio_ring.h>
#include <sys/eventfd.h>

#include <rte_config.h>
#include <rte_memory.h>
#include <rte_mempool.h>

#define RTE_VHOST_USER_CLIENT			(1ULL << 0)
#define RTE_VHOST_USER_NO_RECONNECT		(1ULL << 1)
#define RTE_VHOST_USER_DEQUEUE_ZERO_COPY	(1ULL << 2)

/**
 * Information relating to memory regions, including offsets to
 * addresses in QEMU's memory file.
 */
struct rte_vhost_mem_region {
	uint64_t guest_phys_addr;
	uint64_t guest_user_addr;
	uint64_t host_user_addr;
	uint64_t size;
	void	 *mmap_addr;
	uint64_t mmap_size;
	int fd;
};

/**
 * Memory structure includes region and mapping information.
 */
struct rte_vhost_memory {
	uint32_t nregions;
	struct rte_vhost_mem_region regions[0];
};

struct rte_vhost_inflight_desc_split {
	uint8_t inflight;
	uint8_t padding[5];
	uint16_t next;
	uint64_t counter;
};

struct rte_vhost_inflight_info_split {
	uint64_t features;
	uint16_t version;
	uint16_t desc_num;
	uint16_t last_inflight_io;
	uint16_t used_idx;
	struct rte_vhost_inflight_desc_split desc[0];
};

struct rte_vhost_resubmit_desc {
	uint16_t index;
	uint64_t counter;
};

struct rte_vhost_resubmit_info {
	struct rte_vhost_resubmit_desc *resubmit_list;
	uint16_t resubmit_num;
};

struct rte_vhost_ring_inflight {
	struct rte_vhost_inflight_info_split *inflight_split;
	struct rte_vhost_resubmit_info *resubmit_inflight;
};

struct rte_vhost_vring {
	union {
		struct vring_desc *desc;
		struct vring_packed_desc *desc_packed;
	};
	union {
		struct vring_avail *avail;
		struct vring_packed_desc_event *driver_event;
	};
	union {
		struct vring_used *used;
		struct vring_packed_desc_event *device_event;
	};
	uint64_t log_guest_addr;

	int callfd;
	int kickfd;
	uint16_t size;
};

/**
 * Device and vring operations.
 */
struct vhost_device_ops {
	int (*new_device)(int vid);		/**< Add device. */
	void (*destroy_device)(int vid);	/**< Remove device. */

	int (*vring_state_changed)(int vid, uint16_t queue_id, int enable);	/**< Triggered when a vring is enabled or disabled. */

	/**
	 * Features could be changed after the feature negotiation.
	 * For example, VHOST_F_LOG_ALL will be set/cleared at the
	 * start/end of live migration, respectively. This callback
	 * is used to inform the application of such change.
	 */
	int (*features_changed)(int vid, uint64_t features);
	int (*vhost_nvme_admin_passthrough)(int vid, void *cmd, void *cqe, void *buf);
	int (*vhost_nvme_set_cq_call)(int vid, uint16_t qid, int fd);
	int (*vhost_nvme_set_bar_mr)(int vid, void *bar_addr, uint64_t bar_size);
	int (*vhost_nvme_get_cap)(int vid, uint64_t *cap);

	int (*new_connection)(int vid);
	void (*destroy_connection)(int vid);

	int (*get_config)(int vid, uint8_t *config, uint32_t config_len);
	int (*set_config)(int vid, uint8_t *config, uint32_t offset,
		uint32_t len, uint32_t flags);

	void *reserved[2]; /**< Reserved for future extension */
};
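
/*
 * Example (illustrative sketch, not part of the API): a minimal
 * vhost_device_ops table. A typical application provides at least
 * new_device() and destroy_device(); unused callbacks may be left
 * NULL. All names below are placeholders invented for this example.
 *
 *	static int
 *	example_new_device(int vid)
 *	{
 *		// set up queues, fetch negotiated features, etc.
 *		return 0;
 *	}
 *
 *	static void
 *	example_destroy_device(int vid)
 *	{
 *		// release per-device state
 *	}
 *
 *	static const struct vhost_device_ops example_ops = {
 *		.new_device = example_new_device,
 *		.destroy_device = example_destroy_device,
 *	};
 *
 * The table is attached to a socket path with
 * rte_vhost_driver_callback_register() before rte_vhost_driver_start().
 */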

/**
 * Convert guest physical address to host virtual address
 *
 * @param mem
 *  the guest memory regions
 * @param gpa
 *  the guest physical address for querying
 * @return
 *  the host virtual address on success, 0 on failure
 */
static inline uint64_t __attribute__((always_inline))
rte_vhost_gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa)
{
	struct rte_vhost_mem_region *reg;
	uint32_t i;

	for (i = 0; i < mem->nregions; i++) {
		reg = &mem->regions[i];
		if (gpa >= reg->guest_phys_addr &&
		    gpa <  reg->guest_phys_addr + reg->size) {
			return gpa - reg->guest_phys_addr +
			       reg->host_user_addr;
		}
	}

	return 0;
}

/**
 * Convert guest physical address to host virtual address safely
 *
 * This variant of rte_vhost_gpa_to_vva() also ensures that the
 * requested length is mapped and contiguous in the process address
 * space.
 *
 * @param mem
 *  the guest memory regions
 * @param gpa
 *  the guest physical address for querying
 * @param len
 *  the size of the requested area to map,
 *  updated with actual size mapped
 * @return
 *  the host virtual address on success, 0 on failure
 */
static inline uint64_t
rte_vhost_va_from_guest_pa(struct rte_vhost_memory *mem,
			   uint64_t gpa, uint64_t *len)
{
	struct rte_vhost_mem_region *r;
	uint32_t i;

	for (i = 0; i < mem->nregions; i++) {
		r = &mem->regions[i];
		if (gpa >= r->guest_phys_addr &&
		    gpa <  r->guest_phys_addr + r->size) {

			if (unlikely(*len > r->guest_phys_addr + r->size - gpa))
				*len = r->guest_phys_addr + r->size - gpa;

			return gpa - r->guest_phys_addr +
			       r->host_user_addr;
		}
	}
	*len = 0;

	return 0;
}
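
/*
 * Example (illustrative sketch): translating a descriptor buffer with
 * the length-checked variant. If the buffer crosses a region boundary,
 * *len is trimmed and the tail has to be translated (or copied)
 * separately. "desc" stands for whatever struct vring_desc the
 * application is currently processing.
 *
 *	uint64_t len = desc->len;
 *	uint64_t vva = rte_vhost_va_from_guest_pa(mem, desc->addr, &len);
 *
 *	if (vva == 0)
 *		return -1;	// guest address not mapped at all
 *	if (len < desc->len) {
 *		// buffer is not contiguous in our address space;
 *		// translate the remainder starting at desc->addr + len
 *	}
 */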

#define RTE_VHOST_NEED_LOG(features)	((features) & (1ULL << VHOST_F_LOG_ALL))

/**
 * Log the memory write start with given address.
 *
 * This function only needs to be invoked when live migration starts,
 * so most of the time it is not called at all. To keep the performance
 * impact minimal, it is suggested to do a check before calling it:
 *
 *        if (unlikely(RTE_VHOST_NEED_LOG(features)))
 *                rte_vhost_log_write(vid, addr, len);
 *
 * @param vid
 *  vhost device ID
 * @param addr
 *  the starting address for write
 * @param len
 *  the length to write
 */
void rte_vhost_log_write(int vid, uint64_t addr, uint64_t len);

/**
 * Log the used ring update start at given offset.
 *
 * As with rte_vhost_log_write(), it's suggested to do a check before
 * calling it:
 *
 *        if (unlikely(RTE_VHOST_NEED_LOG(features)))
 *                rte_vhost_log_used_vring(vid, vring_idx, offset, len);
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  the vring index
 * @param offset
 *  the offset inside the used ring
 * @param len
 *  the length to write
 */
void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
			      uint64_t offset, uint64_t len);

int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);

/**
 * Register vhost driver. The path can be different for each instance
 * to support multiple vhost-user driver instances.
 */
int rte_vhost_driver_register(const char *path, uint64_t flags);

/* Unregister vhost driver. This is only meaningful to vhost-user. */
int rte_vhost_driver_unregister(const char *path);

/**
 * Set the feature bits the vhost-user driver supports.
 *
 * @param path
 *  The vhost-user socket file path
 * @param features
 *  The feature bits to advertise
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_driver_set_features(const char *path, uint64_t features);

/**
 * Enable vhost-user driver features.
 *
 * Note that
 * - the param @features should be a subset of the feature bits provided
 *   by rte_vhost_driver_set_features().
 * - it must be invoked before vhost-user negotiation starts.
 *
 * @param path
 *  The vhost-user socket file path
 * @param features
 *  Features to enable
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_driver_enable_features(const char *path, uint64_t features);

/**
 * Disable vhost-user driver features.
 *
 * The two notes at rte_vhost_driver_enable_features() also apply here.
 *
 * @param path
 *  The vhost-user socket file path
 * @param features
 *  Features to disable
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_driver_disable_features(const char *path, uint64_t features);

/**
 * Get the feature bits before feature negotiation.
 *
 * @param path
 *  The vhost-user socket file path
 * @param features
 *  A pointer to store the queried feature bits
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_driver_get_features(const char *path, uint64_t *features);

/**
 * Get the feature bits after negotiation.
 *
 * @param vid
 *  Vhost device ID
 * @param features
 *  A pointer to store the queried feature bits
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_get_negotiated_features(int vid, uint64_t *features);
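
/*
 * Example (illustrative sketch): trimming the advertised feature set
 * before negotiation starts. VIRTIO_NET_F_MRG_RXBUF is the standard
 * virtio-net mergeable-RX-buffer bit (from <linux/virtio_net.h>, not
 * included by this header); any feature the application cannot handle
 * should be disabled this way before rte_vhost_driver_start().
 *
 *	uint64_t features;
 *
 *	if (rte_vhost_driver_get_features(path, &features) != 0)
 *		return -1;
 *	if (features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))
 *		rte_vhost_driver_disable_features(path,
 *			1ULL << VIRTIO_NET_F_MRG_RXBUF);
 */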

/* Register callbacks. */
int rte_vhost_driver_callback_register(const char *path,
	struct vhost_device_ops const * const ops);

/**
 * Start the vhost-user driver.
 *
 * This function triggers the vhost-user negotiation.
 *
 * @param path
 *  The vhost-user socket file path
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_driver_start(const char *path);
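
/*
 * Example (illustrative sketch): the usual start-up sequence for one
 * vhost-user socket. "example_ops" refers to the placeholder ops table
 * sketched after struct vhost_device_ops above; the socket path is
 * arbitrary.
 *
 *	const char *path = "/tmp/vhost.0";
 *
 *	if (rte_vhost_driver_register(path, RTE_VHOST_USER_CLIENT) != 0)
 *		return -1;
 *	if (rte_vhost_driver_callback_register(path, &example_ops) != 0)
 *		return -1;
 *	if (rte_vhost_driver_start(path) != 0)
 *		return -1;
 *
 * Once the master (e.g. QEMU) connects and negotiation completes, the
 * new_device() callback fires with the new device ID.
 */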

/**
 * Get the MTU value of the device if set in QEMU.
 *
 * @param vid
 *  virtio-net device ID
 * @param mtu
 *  The variable to store the MTU value
 *
 * @return
 *  0: success
 *  -EAGAIN: device not yet started
 *  -ENOTSUP: device does not support MTU feature
 */
int rte_vhost_get_mtu(int vid, uint16_t *mtu);

/**
 * Get the NUMA node from which the virtio-net device's memory
 * is allocated.
 *
 * @param vid
 *  vhost device ID
 *
 * @return
 *  The NUMA node, -1 on failure
 */
int rte_vhost_get_numa_node(int vid);

/**
 * Get the virtio-net device's ifname, which is the vhost-user socket
 * file path.
 *
 * @param vid
 *  vhost device ID
 * @param buf
 *  The buffer to store the queried ifname
 * @param len
 *  The length of buf
 *
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_get_ifname(int vid, char *buf, size_t len);

/**
 * Get how many avail entries are left in the queue.
 *
 * @param vid
 *  vhost device ID
 * @param queue_id
 *  virtio queue index
 *
 * @return
 *  number of avail entries left
 */
uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id);

struct rte_mbuf;
struct rte_mempool;

/**
 * This function adds buffers to the virtio device's RX virtqueue. Buffers can
 * be received from the physical port or from another virtual device. A packet
 * count is returned to indicate the number of packets that were successfully
 * added to the RX queue.
 *
 * @param vid
 *  vhost device ID
 * @param queue_id
 *  virtio queue index in mq case
 * @param pkts
 *  array to contain packets to be enqueued
 * @param count
 *  packets num to be enqueued
 * @return
 *  num of packets enqueued
 */
uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
	struct rte_mbuf **pkts, uint16_t count);

/**
 * This function gets guest buffers from the virtio device TX virtqueue,
 * constructs host mbufs, copies guest buffer content to the host mbufs
 * and stores them in pkts to be processed.
 *
 * @param vid
 *  vhost device ID
 * @param queue_id
 *  virtio queue index in mq case
 * @param mbuf_pool
 *  mbuf_pool where host mbuf is allocated
 * @param pkts
 *  array to contain packets to be dequeued
 * @param count
 *  packets num to be dequeued
 * @return
 *  num of packets dequeued
 */
uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count);
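
/*
 * Example (illustrative sketch): echoing packets from the guest TX
 * ring back into the guest RX ring. In the single-queue virtio-net
 * layout, queue 0 is the guest RX ring and queue 1 the guest TX ring.
 * "pool" is assumed to be an rte_mempool created elsewhere with
 * rte_pktmbuf_pool_create().
 *
 *	struct rte_mbuf *pkts[32];
 *	uint16_t n, i;
 *
 *	n = rte_vhost_dequeue_burst(vid, 1, pool, pkts, 32);
 *	rte_vhost_enqueue_burst(vid, 0, pkts, n);
 *
 *	// enqueue copies the data into the guest ring, so the caller
 *	// still owns (and must free) the mbufs:
 *	for (i = 0; i < n; i++)
 *		rte_pktmbuf_free(pkts[i]);
 */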

/**
 * Get guest mem table: a list of memory regions.
 *
 * An rte_vhost_memory object will be allocated internally to hold the
 * guest memory regions. The application should free it at the
 * destroy_device() callback.
 *
 * @param vid
 *  vhost device ID
 * @param mem
 *  To store the returned mem regions
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem);
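
/*
 * Example (illustrative sketch): fetching the memory table in the
 * new_device() callback and keeping it around for address translation.
 * Assuming the usual malloc-based allocation, plain free() releases it.
 *
 *	struct rte_vhost_memory *mem;
 *
 *	if (rte_vhost_get_mem_table(vid, &mem) != 0)
 *		return -1;
 *
 *	uint64_t vva = rte_vhost_gpa_to_vva(mem, gpa);
 *
 *	// later, in destroy_device():
 *	free(mem);
 */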

/**
 * Get guest vring info, including the vring address, vring size, etc.
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  vring index
 * @param vring
 *  the structure to hold the requested vring info
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
	struct rte_vhost_vring *vring);

/**
 * Set the index of the last descriptor processed in the avail and used
 * guest vrings.
 *
 * In case the user application operates directly on buffers, it should
 * use this function on device destruction to retrieve the same values
 * later on in device creation via
 * rte_vhost_get_vhost_vring(int, uint16_t, struct rte_vhost_vring *).
 *
 * @param vid
 *  vhost device ID
 * @param queue_id
 *  vring index
 * @param last_avail_idx
 *  id of the last descriptor in avail ring to be set
 * @param last_used_idx
 *  id of the last descriptor in used ring to be set
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_set_vring_base(int vid, uint16_t queue_id,
	uint16_t last_avail_idx, uint16_t last_used_idx);

/**
 * Get the current last avail/used indexes of the given vring.
 */
int rte_vhost_get_vring_base(int vid, uint16_t queue_id,
	uint16_t *last_avail_idx, uint16_t *last_used_idx);
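
/*
 * Example (illustrative sketch): preserving ring positions across a
 * device restart when the application consumes descriptors directly.
 * "saved_avail" and "saved_used" are per-queue variables the
 * application would keep itself.
 *
 *	// in destroy_device():
 *	rte_vhost_get_vring_base(vid, qid, &saved_avail, &saved_used);
 *
 *	// in new_device(), before processing resumes:
 *	rte_vhost_set_vring_base(vid, qid, saved_avail, saved_used);
 */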

/**
 * Notify the guest that used descriptors have been added to the vring.
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  vring index
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_vring_call(int vid, uint16_t vring_idx);

/**
 * Get guest inflight vring info, including inflight ring and resubmit list.
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  vring index
 * @param vring
 *  the structure to hold the requested inflight vring info
 * @return
 *  0 on success, -1 on failure
 */
__rte_experimental
int
rte_vhost_get_vhost_ring_inflight(int vid, uint16_t vring_idx,
	struct rte_vhost_ring_inflight *vring);

/**
 * Set split inflight descriptor.
 *
 * This function saves descriptors that have been consumed from the
 * available ring.
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  vring index
 * @param idx
 *  inflight entry index
 * @return
 *  0 on success, -1 on failure
 */
__rte_experimental
int
rte_vhost_set_inflight_desc_split(int vid, uint16_t vring_idx,
	uint16_t idx);

/**
 * Save the head of the list of the last batch of used descriptors.
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  vring index
 * @param idx
 *  descriptor entry index
 * @return
 *  0 on success, -1 on failure
 */
__rte_experimental
int
rte_vhost_set_last_inflight_io_split(int vid,
	uint16_t vring_idx, uint16_t idx);

/**
 * Clear the split inflight status.
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  vring index
 * @param last_used_idx
 *  last used idx of used ring
 * @param idx
 *  inflight entry index
 * @return
 *  0 on success, -1 on failure
 */
__rte_experimental
int
rte_vhost_clr_inflight_desc_split(int vid, uint16_t vring_idx,
	uint16_t last_used_idx, uint16_t idx);
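
/*
 * Example (illustrative sketch): one plausible split-ring inflight
 * sequence, so that a backend that crashes mid-request can resubmit
 * unfinished I/O after reconnecting. "head" is the head index of the
 * descriptor chain and "used_idx" the vring's used index, both
 * maintained by the application.
 *
 *	// descriptor chain fetched from the avail ring:
 *	rte_vhost_set_inflight_desc_split(vid, vring_idx, head);
 *
 *	// request completed and its used-ring entry filled in:
 *	rte_vhost_set_last_inflight_io_split(vid, vring_idx, head);
 *	rte_vhost_clr_inflight_desc_split(vid, vring_idx, used_idx, head);
 *	rte_vhost_vring_call(vid, vring_idx);
 */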

#endif /* _RTE_VHOST_H_ */