4 * Copyright (c) Intel Corporation.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 * Set of workarounds for rte_vhost to make it work with device types
36 * other than vhost-net.
39 #include "spdk/stdinc.h"
42 #include "spdk/likely.h"
43 #include "spdk/string.h"
44 #include "spdk/util.h"
45 #include "spdk/memory.h"
46 #include "spdk/barrier.h"
47 #include "spdk/vhost.h"
48 #include "vhost_internal.h"
50 #include "spdk_internal/vhost_user.h"
/*
 * Compute the 2MB-aligned address range [*start, *end) covering one
 * rte_vhost memory region, so the range can be (un)registered with SPDK's
 * memory map for vtophys translation.
 * NOTE(review): structural lines (return type, braces, the *len computation)
 * are missing from this view of the file — visible code left byte-identical.
 */
53 vhost_session_mem_region_calc(uint64_t *previous_start
, uint64_t *start
, uint64_t *end
,
54 uint64_t *len
, struct rte_vhost_mem_region
*region
)
/* Round the mmap'd region down/up to 2MB hugepage boundaries. */
56 *start
= FLOOR_2MB(region
->mmap_addr
);
57 *end
= CEIL_2MB(region
->mmap_addr
+ region
->mmap_size
);
/* If this range starts exactly where the previous one started, advance
 * *start by one 2MB chunk — presumably to avoid touching the same 2MB
 * page twice across adjacent regions (TODO confirm against full file).
 */
58 if (*start
== *previous_start
) {
59 *start
+= (size_t) VALUE_2MB
;
/* Remember where this region's range began for the next iteration. */
61 *previous_start
= *start
;
/*
 * Register every region of a vhost session's guest memory with SPDK's
 * memory map so that vtophys (virtual-to-physical) translation works on it.
 * NOTE(review): loop-counter declaration, braces and some log arguments are
 * missing from this view — visible code left byte-identical.
 */
66 vhost_session_mem_register(struct rte_vhost_memory
*mem
)
68 uint64_t start
, end
, len
;
/* UINT64_MAX guarantees the first region never matches previous_start
 * inside vhost_session_mem_region_calc().
 */
70 uint64_t previous_start
= UINT64_MAX
;
73 for (i
= 0; i
< mem
->nregions
; i
++) {
74 vhost_session_mem_region_calc(&previous_start
, &start
, &end
, &len
, &mem
->regions
[i
]);
75 SPDK_INFOLOG(SPDK_LOG_VHOST
, "Registering VM memory for vtophys translation - 0x%jx len:0x%jx\n",
/* Registration failure is non-fatal here: only a warning is logged;
 * later vtophys lookups on this range may fail.
 */
78 if (spdk_mem_register((void *)start
, len
) != 0) {
79 SPDK_WARNLOG("Failed to register memory region %"PRIu32
". Future vtophys translation might fail.\n",
/*
 * Undo vhost_session_mem_register(): unregister each region's 2MB-aligned
 * range from SPDK's memory map. Ranges that were never successfully
 * registered (vtophys probe fails) are skipped.
 * NOTE(review): loop-counter declaration, braces and error-path lines are
 * missing from this view — visible code left byte-identical.
 */
87 vhost_session_mem_unregister(struct rte_vhost_memory
*mem
)
89 uint64_t start
, end
, len
;
91 uint64_t previous_start
= UINT64_MAX
;
93 for (i
= 0; i
< mem
->nregions
; i
++) {
94 vhost_session_mem_region_calc(&previous_start
, &start
, &end
, &len
, &mem
->regions
[i
]);
/* Probe the translation first: an error means this range was never
 * registered, so there is nothing to unregister.
 */
95 if (spdk_vtophys((void *) start
, NULL
) == SPDK_VTOPHYS_ERROR
) {
96 continue; /* region has not been registered */
99 if (spdk_mem_unregister((void *)start
, len
) != 0) {
/*
 * rte_vhost new_connection callback: resolve the unix-socket ifname for
 * this vid and forward the event to SPDK's generic connection handler.
 * NOTE(review): return-type line, braces and the error-path return are
 * missing from this view — visible code left byte-identical.
 */
106 new_connection(int vid
)
108 char ifname
[PATH_MAX
];
110 if (rte_vhost_get_ifname(vid
, ifname
, PATH_MAX
) < 0) {
111 SPDK_ERRLOG("Couldn't get a valid ifname for device with vid %d\n", vid
);
115 return vhost_new_connection_cb(vid
, ifname
);
/* rte_vhost new_device callback: delegate to SPDK's start handler.
 * NOTE(review): return-type line and braces are missing from this view.
 */
119 start_device(int vid
)
121 return vhost_start_device_cb(vid
);
/* Body fragment of the rte_vhost destroy_device callback (stop_device):
 * delegates to SPDK's stop handler. NOTE(review): the enclosing function
 * signature is missing from this view of the file.
 */
127 vhost_stop_device_cb(vid
);
/* rte_vhost destroy_connection callback: forward to SPDK's handler.
 * NOTE(review): return-type line and braces are missing from this view.
 */
131 destroy_connection(int vid
)
133 vhost_destroy_connection_cb(vid
);
/*
 * Callback table handed to rte_vhost: maps device/connection lifecycle
 * events onto the SPDK handlers defined above. The config and vhost-nvme
 * entries exist only for SPDK's internal rte_vhost copy
 * (SPDK_CONFIG_VHOST_INTERNAL_LIB builds).
 * NOTE(review): the closing "#endif" and "};" are missing from this view —
 * visible code left byte-identical.
 */
136 static const struct vhost_device_ops g_spdk_vhost_ops
= {
137 .new_device
= start_device
,
138 .destroy_device
= stop_device
,
139 .new_connection
= new_connection
,
140 .destroy_connection
= destroy_connection
,
141 #ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
142 .get_config
= vhost_get_config_cb
,
143 .set_config
= vhost_set_config_cb
,
144 .vhost_nvme_admin_passthrough
= vhost_nvme_admin_passthrough
,
145 .vhost_nvme_set_cq_call
= vhost_nvme_set_cq_call
,
146 .vhost_nvme_get_cap
= vhost_nvme_get_cap
,
147 .vhost_nvme_set_bar_mr
= vhost_nvme_set_bar_mr
,
151 #ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB
/*
 * Hook called by rte_vhost *before* it processes each vhost-user message.
 * Stops (and marks for restart) the SPDK session around messages that would
 * otherwise yank memory or descriptors out from under in-flight I/O, and
 * implements the SeaBIOS "forced polling" workaround. Handles GET/SET_CONFIG
 * itself (returning REPLY/OK/ERR); everything else falls through with
 * RTE_VHOST_MSG_RESULT_NOT_HANDLED so rte_vhost processes it normally.
 * NOTE(review): braces, break statements, rc declarations and some closing
 * comment markers are missing from this view — visible code byte-identical.
 */
153 static enum rte_vhost_msg_result
154 extern_vhost_pre_msg_handler(int vid
, void *_msg
)
156 struct vhost_user_msg
*msg
= _msg
;
157 struct spdk_vhost_session
*vsession
;
/* Messages for sessions we do not know about cannot be handled safely. */
159 vsession
= vhost_session_find_by_vid(vid
);
160 if (vsession
== NULL
) {
161 SPDK_ERRLOG("Received a message to unitialized session (vid %d).\n", vid
);
163 return RTE_VHOST_MSG_RESULT_ERR
;
166 switch (msg
->request
) {
167 case VHOST_USER_GET_VRING_BASE
:
168 if (vsession
->forced_polling
&& vsession
->started
) {
169 /* Our queue is stopped for whatever reason, but we may still
170 * need to poll it after it's initialized again.
172 g_spdk_vhost_ops
.destroy_device(vid
);
175 case VHOST_USER_SET_VRING_BASE
:
176 case VHOST_USER_SET_VRING_ADDR
:
177 case VHOST_USER_SET_VRING_NUM
:
178 case VHOST_USER_SET_VRING_KICK
:
179 if (vsession
->forced_polling
&& vsession
->started
) {
180 /* Additional queues are being initialized, so we either processed
181 * enough I/Os and are switching from SeaBIOS to the OS now, or
182 * we were never in SeaBIOS in the first place. Either way, we
183 * don't need our workaround anymore.
185 g_spdk_vhost_ops
.destroy_device(vid
);
186 vsession
->forced_polling
= false;
189 case VHOST_USER_SET_VRING_CALL
:
190 /* rte_vhost will close the previous callfd and won't notify
191 * us about any change. This will effectively make SPDK fail
192 * to deliver any subsequent interrupts until a session is
193 * restarted. We stop the session here before closing the previous
194 * fd (so that all interrupts must have been delivered by the
195 * time the descriptor is closed) and start right after (which
196 * will make SPDK retrieve the latest, up-to-date callfd from
199 case VHOST_USER_SET_MEM_TABLE
:
200 /* rte_vhost will unmap previous memory that SPDK may still
201 * have pending DMA operations on. We can't let that happen,
202 * so stop the device before letting rte_vhost unmap anything.
203 * This will block until all pending I/Os are finished.
204 * We will start the device again from the post-processing
207 if (vsession
->started
) {
208 g_spdk_vhost_ops
.destroy_device(vid
);
209 vsession
->needs_restart
= true;
/* GET_CONFIG is answered directly from the backend, bypassing rte_vhost. */
212 case VHOST_USER_GET_CONFIG
: {
216 if (vsession
->vdev
->backend
->vhost_get_config
) {
217 rc
= vsession
->vdev
->backend
->vhost_get_config(vsession
->vdev
,
218 msg
->payload
.cfg
.region
, msg
->payload
.cfg
.size
);
/* REPLY makes rte_vhost send the (possibly updated) msg back to the master. */
225 return RTE_VHOST_MSG_RESULT_REPLY
;
227 case VHOST_USER_SET_CONFIG
: {
231 if (vsession
->vdev
->backend
->vhost_set_config
) {
232 rc
= vsession
->vdev
->backend
->vhost_set_config(vsession
->vdev
,
233 msg
->payload
.cfg
.region
, msg
->payload
.cfg
.offset
,
234 msg
->payload
.cfg
.size
, msg
->payload
.cfg
.flags
);
238 return rc
== 0 ? RTE_VHOST_MSG_RESULT_OK
: RTE_VHOST_MSG_RESULT_ERR
;
/* Let rte_vhost perform its default processing for this message. */
244 return RTE_VHOST_MSG_RESULT_NOT_HANDLED
;
/*
 * Hook called by rte_vhost *after* it has processed each vhost-user message.
 * Restarts a session that the pre-handler stopped (needs_restart), and
 * implements the SeaBIOS single-queue polling workaround: mark the session
 * for forced polling on SET_FEATURES, then actually start polling on the
 * first SET_VRING_KICK. Always lets rte_vhost keep its own result
 * (RTE_VHOST_MSG_RESULT_NOT_HANDLED), except on an unknown session.
 * NOTE(review): braces, break statements and the trailing lines of the
 * function are missing from this view — visible code left byte-identical.
 */
247 static enum rte_vhost_msg_result
248 extern_vhost_post_msg_handler(int vid
, void *_msg
)
250 struct vhost_user_msg
*msg
= _msg
;
251 struct spdk_vhost_session
*vsession
;
253 vsession
= vhost_session_find_by_vid(vid
);
254 if (vsession
== NULL
) {
255 SPDK_ERRLOG("Received a message to unitialized session (vid %d).\n", vid
);
257 return RTE_VHOST_MSG_RESULT_ERR
;
/* Session was stopped by the pre-handler (e.g. for SET_MEM_TABLE);
 * bring it back up now that rte_vhost has finished the message.
 */
260 if (vsession
->needs_restart
) {
261 g_spdk_vhost_ops
.new_device(vid
);
262 vsession
->needs_restart
= false;
263 return RTE_VHOST_MSG_RESULT_NOT_HANDLED
;
266 switch (msg
->request
) {
267 case VHOST_USER_SET_FEATURES
:
268 /* rte_vhost requires all queues to be fully initialized in order
269 * to start I/O processing. This behavior is not compliant with the
270 * vhost-user specification and doesn't work with QEMU 2.12+, which
271 * will only initialize 1 I/O queue for the SeaBIOS boot.
272 * Theoretically, we should start polling each virtqueue individually
273 * after receiving its SET_VRING_KICK message, but rte_vhost is not
274 * designed to poll individual queues. So here we use a workaround
275 * to detect when the vhost session could be potentially at that SeaBIOS
276 * stage and we mark it to start polling as soon as its first virtqueue
277 * gets initialized. This doesn't hurt any non-QEMU vhost slaves
278 * and allows QEMU 2.12+ to boot correctly. SET_FEATURES could be sent
279 * at any time, but QEMU will send it at least once on SeaBIOS
280 * initialization - whenever powered-up or rebooted.
282 vsession
->forced_polling
= true;
284 case VHOST_USER_SET_VRING_KICK
:
285 /* vhost-user spec tells us to start polling a queue after receiving
286 * its SET_VRING_KICK message. Let's do it!
288 if (vsession
->forced_polling
&& !vsession
->started
) {
289 g_spdk_vhost_ops
.new_device(vid
);
296 return RTE_VHOST_MSG_RESULT_NOT_HANDLED
;
/*
 * Extern-callback table registered per-session via
 * rte_vhost_extern_callback_register(); wires in the pre/post vhost-user
 * message hooks above. NOTE(review): the closing "};" is missing from this
 * view — visible code left byte-identical.
 */
299 struct rte_vhost_user_extern_ops g_spdk_extern_vhost_ops
= {
300 .pre_msg_handle
= extern_vhost_pre_msg_handler
,
301 .post_msg_handle
= extern_vhost_post_msg_handler
,
/*
 * Install the extern pre/post message hooks for one session (upstream
 * rte_vhost build). Logs an error if registration fails.
 * NOTE(review): return-type line, braces and the rc check/return are
 * missing from this view — visible code left byte-identical.
 */
305 vhost_session_install_rte_compat_hooks(struct spdk_vhost_session
*vsession
)
309 rc
= rte_vhost_extern_callback_register(vsession
->vid
, &g_spdk_extern_vhost_ops
, NULL
);
311 SPDK_ERRLOG("rte_vhost_extern_callback_register() failed for vid = %d\n",
317 #else /* SPDK_CONFIG_VHOST_INTERNAL_LIB */
/* Internal-rte_vhost build: no-op, the compat hooks are compiled directly
 * into SPDK's bundled rte_vhost copy.
 */
320 vhost_session_install_rte_compat_hooks(struct spdk_vhost_session
*vsession
)
322 /* nothing to do. all the changes are already incorporated into rte_vhost */
/*
 * Create and start a vhost-user unix domain socket at `path` for controller
 * `ctrl_name`: remove any stale socket file, register the driver, apply the
 * virtio feature masks, hook up g_spdk_vhost_ops, and (on upstream rte_vhost
 * builds) merge in the requested protocol features before starting.
 * NOTE(review): return-type line, braces, error returns and the #endif
 * markers are missing from this view — visible code left byte-identical.
 */
328 vhost_register_unix_socket(const char *path
, const char *ctrl_name
,
329 uint64_t virtio_features
, uint64_t disabled_features
, uint64_t protocol_features
)
331 struct stat file_stat
;
332 #ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB
333 uint64_t features
= 0;
336 /* Register vhost driver to handle vhost messages. */
/* If something already exists at `path`, only unlink it when it is a
 * socket (a leftover from a previous run); refuse to clobber other files.
 */
337 if (stat(path
, &file_stat
) != -1) {
338 if (!S_ISSOCK(file_stat
.st_mode
)) {
339 SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": "
340 "The file already exists and is not a socket.\n",
343 } else if (unlink(path
) != 0) {
344 SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": "
345 "The socket already exists and failed to unlink.\n",
351 if (rte_vhost_driver_register(path
, 0) != 0) {
352 SPDK_ERRLOG("Could not register controller %s with vhost library\n", ctrl_name
);
353 SPDK_ERRLOG("Check if domain socket %s already exists\n", path
);
/* Apply the virtio feature bits; on failure tear the driver down again. */
356 if (rte_vhost_driver_set_features(path
, virtio_features
) ||
357 rte_vhost_driver_disable_features(path
, disabled_features
)) {
358 SPDK_ERRLOG("Couldn't set vhost features for controller %s\n", ctrl_name
);
360 rte_vhost_driver_unregister(path
);
364 if (rte_vhost_driver_callback_register(path
, &g_spdk_vhost_ops
) != 0) {
365 rte_vhost_driver_unregister(path
);
366 SPDK_ERRLOG("Couldn't register callbacks for controller %s\n", ctrl_name
);
370 #ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB
/* OR the caller's protocol features into whatever rte_vhost already set. */
371 rte_vhost_driver_get_protocol_features(path
, &features
);
372 features
|= protocol_features
;
373 rte_vhost_driver_set_protocol_features(path
, features
);
376 if (rte_vhost_driver_start(path
) != 0) {
377 SPDK_ERRLOG("Failed to start vhost driver for controller %s (%d): %s\n",
378 ctrl_name
, errno
, spdk_strerror(errno
));
379 rte_vhost_driver_unregister(path
);
/* Thin wrapper: fetch the guest memory region table for `vid` from
 * rte_vhost. NOTE(review): return-type line and braces are missing from
 * this view — visible code left byte-identical.
 */
387 vhost_get_mem_table(int vid
, struct rte_vhost_memory
**mem
)
389 return rte_vhost_get_mem_table(vid
, mem
);
/* Thin wrapper: unregister the vhost-user socket at `path` from rte_vhost.
 * NOTE(review): return-type line and braces are missing from this view.
 */
393 vhost_driver_unregister(const char *path
)
395 return rte_vhost_driver_unregister(path
);
/* Thin wrapper: fetch the feature bits negotiated for `vid` from rte_vhost.
 * NOTE(review): return-type line and braces are missing from this view.
 */
399 vhost_get_negotiated_features(int vid
, uint64_t *negotiated_features
)
401 return rte_vhost_get_negotiated_features(vid
, negotiated_features
);