ceph.git: ceph/src/spdk/lib/vhost/vhost.c (sources updated to Ceph Nautilus 14.2.1)
1 /*-
2 * BSD LICENSE
3 *
4 * Copyright(c) Intel Corporation. All rights reserved.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #include "spdk/stdinc.h"
35
36 #include "spdk/env.h"
37 #include "spdk/likely.h"
38 #include "spdk/string.h"
39 #include "spdk/util.h"
40 #include "spdk/barrier.h"
41
42 #include "spdk/vhost.h"
43 #include "vhost_internal.h"
44
45 static uint32_t *g_num_ctrlrs;
46
47 /* Path to the directory where the vhost domain sockets are created. Can be set by the user. */
48 static char dev_dirname[PATH_MAX] = "";
49
50 struct spdk_vhost_dev_event_ctx {
51 /** Pointer to the controller obtained before enqueuing the event */
52 struct spdk_vhost_dev *vdev;
53
54 /** ID of the vdev to send event to. */
55 unsigned vdev_id;
56
57 /** User callback function to be executed on given lcore. */
58 spdk_vhost_event_fn cb_fn;
59
60 /** Semaphore used to signal that event is done. */
61 sem_t sem;
62
63 /** Response to be written by enqueued event. */
64 int response;
65 };
66
67 static int new_connection(int vid);
68 static int start_device(int vid);
69 static void stop_device(int vid);
70 static void destroy_connection(int vid);
71 static int get_config(int vid, uint8_t *config, uint32_t len);
72 static int set_config(int vid, uint8_t *config, uint32_t offset,
73 uint32_t size, uint32_t flags);
74
75 const struct vhost_device_ops g_spdk_vhost_ops = {
76 .new_device = start_device,
77 .destroy_device = stop_device,
78 .get_config = get_config,
79 .set_config = set_config,
80 .new_connection = new_connection,
81 .destroy_connection = destroy_connection,
82 .vhost_nvme_admin_passthrough = spdk_vhost_nvme_admin_passthrough,
83 .vhost_nvme_set_cq_call = spdk_vhost_nvme_set_cq_call,
84 .vhost_nvme_get_cap = spdk_vhost_nvme_get_cap,
85 };
86
87 static TAILQ_HEAD(, spdk_vhost_dev) g_spdk_vhost_devices = TAILQ_HEAD_INITIALIZER(
88 g_spdk_vhost_devices);
89 static pthread_mutex_t g_spdk_vhost_mutex = PTHREAD_MUTEX_INITIALIZER;
90
91 void *spdk_vhost_gpa_to_vva(struct spdk_vhost_dev *vdev, uint64_t addr, uint64_t len)
92 {
93 void *vva;
94 uint64_t newlen;
95
96 newlen = len;
97 vva = (void *)rte_vhost_va_from_guest_pa(vdev->mem, addr, &newlen);
98 if (newlen != len) {
99 return NULL;
100 }
101
102 return vva;
103
104 }
105
106 static void
107 spdk_vhost_log_req_desc(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *virtqueue,
108 uint16_t req_id)
109 {
110 struct vring_desc *desc, *desc_table;
111 uint32_t desc_table_size;
112 int rc;
113
114 if (spdk_likely(!spdk_vhost_dev_has_feature(vdev, VHOST_F_LOG_ALL))) {
115 return;
116 }
117
118 rc = spdk_vhost_vq_get_desc(vdev, virtqueue, req_id, &desc, &desc_table, &desc_table_size);
119 if (spdk_unlikely(rc != 0)) {
120 SPDK_ERRLOG("Can't log used ring descriptors!\n");
121 return;
122 }
123
124 do {
125 if (spdk_vhost_vring_desc_is_wr(desc)) {
126 /* To be honest, only pages really touched should be logged, but
127 * doing so would require tracking those changes in each backend.
128 * Also, the backend will most likely touch all/most of those pages, so
129 * for now let's assume we touched all pages passed to us as writable buffers. */
130 rte_vhost_log_write(vdev->vid, desc->addr, desc->len);
131 }
132 spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
133 } while (desc);
134 }
135
136 static void
137 spdk_vhost_log_used_vring_elem(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *virtqueue,
138 uint16_t idx)
139 {
140 uint64_t offset, len;
141 uint16_t vq_idx;
142
143 if (spdk_likely(!spdk_vhost_dev_has_feature(vdev, VHOST_F_LOG_ALL))) {
144 return;
145 }
146
147 offset = offsetof(struct vring_used, ring[idx]);
148 len = sizeof(virtqueue->vring.used->ring[idx]);
149 vq_idx = virtqueue - vdev->virtqueue;
150
151 rte_vhost_log_used_vring(vdev->vid, vq_idx, offset, len);
152 }
153
154 static void
155 spdk_vhost_log_used_vring_idx(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *virtqueue)
156 {
157 uint64_t offset, len;
158 uint16_t vq_idx;
159
160 if (spdk_likely(!spdk_vhost_dev_has_feature(vdev, VHOST_F_LOG_ALL))) {
161 return;
162 }
163
164 offset = offsetof(struct vring_used, idx);
165 len = sizeof(virtqueue->vring.used->idx);
166 vq_idx = virtqueue - vdev->virtqueue;
167
168 rte_vhost_log_used_vring(vdev->vid, vq_idx, offset, len);
169 }
170
171 /*
172 * Get available requests from avail ring.
173 */
174 uint16_t
175 spdk_vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *virtqueue, uint16_t *reqs,
176 uint16_t reqs_len)
177 {
178 struct rte_vhost_vring *vring = &virtqueue->vring;
179 struct vring_avail *avail = vring->avail;
180 uint16_t size_mask = vring->size - 1;
181 uint16_t last_idx = vring->last_avail_idx, avail_idx = avail->idx;
182 uint16_t count, i;
183
184 count = avail_idx - last_idx;
185 if (spdk_likely(count == 0)) {
186 return 0;
187 }
188
189 if (spdk_unlikely(count > vring->size)) {
190 /* TODO: the queue is unrecoverably broken and should be marked so.
191 * For now we will fail silently and report there are no new avail entries.
192 */
193 return 0;
194 }
195
196 count = spdk_min(count, reqs_len);
197 vring->last_avail_idx += count;
198 for (i = 0; i < count; i++) {
199 reqs[i] = vring->avail->ring[(last_idx + i) & size_mask];
200 }
201
202 SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING,
203 "AVAIL: last_idx=%"PRIu16" avail_idx=%"PRIu16" count=%"PRIu16"\n",
204 last_idx, avail_idx, count);
205
206 return count;
207 }
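
/* Editor's note (illustrative values only): the avail/used indices are
 * free-running 16-bit counters, so the subtraction above stays correct across
 * wrap-around. E.g. with vring->size = 256, last_avail_idx = 65534 and
 * avail->idx = 3, `avail_idx - last_idx` yields 5 in uint16_t arithmetic and
 * the entries are read from ring[(65534 + i) & 255] for i = 0..4. */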
208
209 static bool
210 spdk_vhost_vring_desc_is_indirect(struct vring_desc *cur_desc)
211 {
212 return !!(cur_desc->flags & VRING_DESC_F_INDIRECT);
213 }
214
215 int
216 spdk_vhost_vq_get_desc(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *virtqueue,
217 uint16_t req_idx, struct vring_desc **desc, struct vring_desc **desc_table,
218 uint32_t *desc_table_size)
219 {
220 if (spdk_unlikely(req_idx >= virtqueue->vring.size)) {
221 return -1;
222 }
223
224 *desc = &virtqueue->vring.desc[req_idx];
225
226 if (spdk_vhost_vring_desc_is_indirect(*desc)) {
227 assert(spdk_vhost_dev_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC));
228 *desc_table_size = (*desc)->len / sizeof(**desc);
229 *desc_table = spdk_vhost_gpa_to_vva(vdev, (*desc)->addr,
230 sizeof(**desc) * *desc_table_size);
231 *desc = *desc_table;
232 if (*desc == NULL) {
233 return -1;
234 }
235
236 return 0;
237 }
238
239 *desc_table = virtqueue->vring.desc;
240 *desc_table_size = virtqueue->vring.size;
241
242 return 0;
243 }
244
245 int
246 spdk_vhost_vq_used_signal(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *virtqueue)
247 {
248 if (virtqueue->used_req_cnt == 0) {
249 return 0;
250 }
251
252 virtqueue->req_cnt += virtqueue->used_req_cnt;
253 virtqueue->used_req_cnt = 0;
254
255 SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING,
256 "Queue %td - USED RING: sending IRQ: last used %"PRIu16"\n",
257 virtqueue - vdev->virtqueue, virtqueue->vring.last_used_idx);
258
259 eventfd_write(virtqueue->vring.callfd, (eventfd_t)1);
260 return 1;
261 }
262
263
264 static void
265 check_dev_io_stats(struct spdk_vhost_dev *vdev, uint64_t now)
266 {
267 struct spdk_vhost_virtqueue *virtqueue;
268 uint32_t irq_delay_base = vdev->coalescing_delay_time_base;
269 uint32_t io_threshold = vdev->coalescing_io_rate_threshold;
270 int32_t irq_delay;
271 uint32_t req_cnt;
272 uint16_t q_idx;
273
274 if (now < vdev->next_stats_check_time) {
275 return;
276 }
277
278 vdev->next_stats_check_time = now + vdev->stats_check_interval;
279 for (q_idx = 0; q_idx < vdev->max_queues; q_idx++) {
280 virtqueue = &vdev->virtqueue[q_idx];
281
282 req_cnt = virtqueue->req_cnt + virtqueue->used_req_cnt;
283 if (req_cnt <= io_threshold) {
284 continue;
285 }
286
287 irq_delay = (irq_delay_base * (req_cnt - io_threshold)) / io_threshold;
288 virtqueue->irq_delay_time = (uint32_t) spdk_max(0, irq_delay);
289
290 virtqueue->req_cnt = 0;
291 virtqueue->next_event_time = now;
292 }
293 }
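
/* Editor's note (illustrative values, not taken from the source): with an
 * irq_delay_base equivalent to 50 us and io_threshold = 600 requests per
 * stats interval, a queue that completed 900 requests gets
 * irq_delay = 50 * (900 - 600) / 600 = 25 us worth of ticks, i.e. interrupts
 * are held back longer the further the queue runs above the IOPS threshold. */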
294
295 void
296 spdk_vhost_dev_used_signal(struct spdk_vhost_dev *vdev)
297 {
298 struct spdk_vhost_virtqueue *virtqueue;
299 uint64_t now;
300 uint16_t q_idx;
301
302 if (vdev->coalescing_delay_time_base == 0) {
303 for (q_idx = 0; q_idx < vdev->max_queues; q_idx++) {
304 virtqueue = &vdev->virtqueue[q_idx];
305
306 if (virtqueue->vring.desc == NULL ||
307 (virtqueue->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
308 continue;
309 }
310
311 spdk_vhost_vq_used_signal(vdev, virtqueue);
312 }
313 } else {
314 now = spdk_get_ticks();
315 check_dev_io_stats(vdev, now);
316
317 for (q_idx = 0; q_idx < vdev->max_queues; q_idx++) {
318 virtqueue = &vdev->virtqueue[q_idx];
319
320 /* No need for event right now */
321 if (now < virtqueue->next_event_time ||
322 (virtqueue->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
323 continue;
324 }
325
326 if (!spdk_vhost_vq_used_signal(vdev, virtqueue)) {
327 continue;
328 }
329
330 /* The signaling syscall above is relatively long, so refresh the current time. */
331 now = spdk_get_ticks();
332 virtqueue->next_event_time = now + virtqueue->irq_delay_time;
333 }
334 }
335 }
336
337 int
338 spdk_vhost_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us,
339 uint32_t iops_threshold)
340 {
341 uint64_t delay_time_base = delay_base_us * spdk_get_ticks_hz() / 1000000ULL;
342 uint32_t io_rate = iops_threshold * SPDK_VHOST_DEV_STATS_CHECK_INTERVAL_MS / 1000U;
343
344 if (delay_time_base >= UINT32_MAX) {
345 SPDK_ERRLOG("Delay time of %"PRIu32" is too big\n", delay_base_us);
346 return -EINVAL;
347 } else if (io_rate == 0) {
348 SPDK_ERRLOG("IOPS rate of %"PRIu32" is too low. Min is %u\n", io_rate,
349 1000U / SPDK_VHOST_DEV_STATS_CHECK_INTERVAL_MS);
350 return -EINVAL;
351 }
352
353 vdev->coalescing_delay_time_base = delay_time_base;
354 vdev->coalescing_io_rate_threshold = io_rate;
355
356 vdev->coalescing_delay_us = delay_base_us;
357 vdev->coalescing_iops_threshold = iops_threshold;
358 return 0;
359 }
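
/* Editor's note, a worked conversion assuming a 1 GHz tick rate and a 10 ms
 * stats check interval (both hypothetical here): delay_base_us = 100 gives
 * delay_time_base = 100 * 1000000000 / 1000000 = 100000 ticks, and
 * iops_threshold = 10000 gives io_rate = 10000 * 10 / 1000 = 100 requests per
 * stats interval, which is what check_dev_io_stats() compares req_cnt against. */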
360
361 void
362 spdk_vhost_get_coalescing(struct spdk_vhost_dev *vdev, uint32_t *delay_base_us,
363 uint32_t *iops_threshold)
364 {
365 if (delay_base_us) {
366 *delay_base_us = vdev->coalescing_delay_us;
367 }
368
369 if (iops_threshold) {
370 *iops_threshold = vdev->coalescing_iops_threshold;
371 }
372 }
373
374 /*
375 * Enqueue id and len to used ring.
376 */
377 void
378 spdk_vhost_vq_used_ring_enqueue(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *virtqueue,
379 uint16_t id, uint32_t len)
380 {
381 struct rte_vhost_vring *vring = &virtqueue->vring;
382 struct vring_used *used = vring->used;
383 uint16_t last_idx = vring->last_used_idx & (vring->size - 1);
384
385 SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING,
386 "Queue %td - USED RING: last_idx=%"PRIu16" req id=%"PRIu16" len=%"PRIu32"\n",
387 virtqueue - vdev->virtqueue, vring->last_used_idx, id, len);
388
389 spdk_vhost_log_req_desc(vdev, virtqueue, id);
390
391 vring->last_used_idx++;
392 used->ring[last_idx].id = id;
393 used->ring[last_idx].len = len;
394
395 /* Ensure the used ring is updated before we log it or increment used->idx. */
396 spdk_smp_wmb();
397
398 spdk_vhost_log_used_vring_elem(vdev, virtqueue, last_idx);
399 * (volatile uint16_t *) &used->idx = vring->last_used_idx;
400 spdk_vhost_log_used_vring_idx(vdev, virtqueue);
401
402 /* Ensure all our used ring changes are visible to the guest at the time
403 * of interrupt.
404 * TODO: this is currently an sfence on x86. For other architectures we
405 * will most likely need an smp_mb(), but smp_mb() is an overkill for x86.
406 */
407 spdk_wmb();
408
409 virtqueue->used_req_cnt++;
410 }
411
412 int
413 spdk_vhost_vring_desc_get_next(struct vring_desc **desc,
414 struct vring_desc *desc_table, uint32_t desc_table_size)
415 {
416 struct vring_desc *old_desc = *desc;
417 uint16_t next_idx;
418
419 if ((old_desc->flags & VRING_DESC_F_NEXT) == 0) {
420 *desc = NULL;
421 return 0;
422 }
423
424 next_idx = old_desc->next;
425 if (spdk_unlikely(next_idx >= desc_table_size)) {
426 *desc = NULL;
427 return -1;
428 }
429
430 *desc = &desc_table[next_idx];
431 return 0;
432 }
433
434 bool
435 spdk_vhost_vring_desc_is_wr(struct vring_desc *cur_desc)
436 {
437 return !!(cur_desc->flags & VRING_DESC_F_WRITE);
438 }
439
440 #define _2MB_OFFSET(ptr) ((ptr) & (0x200000 - 1))
441
442 int
443 spdk_vhost_vring_desc_to_iov(struct spdk_vhost_dev *vdev, struct iovec *iov,
444 uint16_t *iov_index, const struct vring_desc *desc)
445 {
446 uint32_t remaining = desc->len;
447 uint32_t to_boundary;
448 uint32_t len;
449 uintptr_t payload = desc->addr;
450 uintptr_t vva;
451
452 while (remaining) {
453 if (*iov_index >= SPDK_VHOST_IOVS_MAX) {
454 SPDK_ERRLOG("SPDK_VHOST_IOVS_MAX(%d) reached\n", SPDK_VHOST_IOVS_MAX);
455 return -1;
456 }
457 vva = (uintptr_t)rte_vhost_gpa_to_vva(vdev->mem, payload);
458 if (vva == 0) {
459 SPDK_ERRLOG("gpa_to_vva(%p) == NULL\n", (void *)payload);
460 return -1;
461 }
462 to_boundary = 0x200000 - _2MB_OFFSET(payload);
463 if (spdk_likely(remaining <= to_boundary)) {
464 len = remaining;
465 } else {
466 /*
467 * Descriptor crosses a 2MB hugepage boundary. vhost memory regions are allocated
468 * from hugepage memory, so this means this descriptor may be described by
469 * discontiguous vhost memory regions. Do not blindly split on the 2MB boundary,
470 * only split it if the two sides of the boundary do not map to the same vhost
471 * memory region. This helps ensure we do not exceed the max number of IOVs
472 * defined by SPDK_VHOST_IOVS_MAX.
473 */
474 len = to_boundary;
475 while (len < remaining) {
476 if (vva + len != (uintptr_t)rte_vhost_gpa_to_vva(vdev->mem, payload + len)) {
477 break;
478 }
479 len += spdk_min(remaining - len, 0x200000);
480 }
481 }
482 iov[*iov_index].iov_base = (void *)vva;
483 iov[*iov_index].iov_len = len;
484 remaining -= len;
485 payload += len;
486 (*iov_index)++;
487 }
488
489 return 0;
490 }
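
/* Editor's note (illustrative): a 5 MB descriptor whose guest-physical range
 * maps to one virtually contiguous host region is emitted as a single iovec;
 * only if rte_vhost_gpa_to_vva() returns a discontiguous mapping at a 2 MB
 * boundary is the buffer split there into separate iovecs. */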
491
492 static struct spdk_vhost_dev *
493 spdk_vhost_dev_find_by_id(unsigned id)
494 {
495 struct spdk_vhost_dev *vdev;
496
497 TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) {
498 if (vdev->id == id) {
499 return vdev;
500 }
501 }
502
503 return NULL;
504 }
505
506 static struct spdk_vhost_dev *
507 spdk_vhost_dev_find_by_vid(int vid)
508 {
509 struct spdk_vhost_dev *vdev;
510
511 TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) {
512 if (vdev->vid == vid) {
513 return vdev;
514 }
515 }
516
517 return NULL;
518 }
519
520 #define SHIFT_2MB 21
521 #define SIZE_2MB (1ULL << SHIFT_2MB)
522 #define FLOOR_2MB(x) (((uintptr_t)x) / SIZE_2MB) << SHIFT_2MB
523 #define CEIL_2MB(x) ((((uintptr_t)x) + SIZE_2MB - 1) / SIZE_2MB) << SHIFT_2MB
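
/* Editor's note: e.g. FLOOR_2MB(0x201000) evaluates to 0x200000 and
 * CEIL_2MB(0x201000) to 0x400000, so the mmap'ed regions below are rounded
 * outwards to whole 2 MB hugepages before being (un)registered for vtophys
 * translation. */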
524
525 static void
526 spdk_vhost_dev_mem_register(struct spdk_vhost_dev *vdev)
527 {
528 struct rte_vhost_mem_region *region;
529 uint32_t i;
530
531 for (i = 0; i < vdev->mem->nregions; i++) {
532 uint64_t start, end, len;
533 region = &vdev->mem->regions[i];
534 start = FLOOR_2MB(region->mmap_addr);
535 end = CEIL_2MB(region->mmap_addr + region->mmap_size);
536 len = end - start;
537 SPDK_INFOLOG(SPDK_LOG_VHOST, "Registering VM memory for vtophys translation - 0x%jx len:0x%jx\n",
538 start, len);
539
540 if (spdk_mem_register((void *)start, len) != 0) {
541 SPDK_WARNLOG("Failed to register memory region %"PRIu32". Future vtophys translation might fail.\n",
542 i);
543 continue;
544 }
545 }
546 }
547
548 static void
549 spdk_vhost_dev_mem_unregister(struct spdk_vhost_dev *vdev)
550 {
551 struct rte_vhost_mem_region *region;
552 uint32_t i;
553
554 for (i = 0; i < vdev->mem->nregions; i++) {
555 uint64_t start, end, len;
556 region = &vdev->mem->regions[i];
557 start = FLOOR_2MB(region->mmap_addr);
558 end = CEIL_2MB(region->mmap_addr + region->mmap_size);
559 len = end - start;
560
561 if (spdk_vtophys((void *) start) == SPDK_VTOPHYS_ERROR) {
562 continue; /* region has not been registered */
563 }
564
565 if (spdk_mem_unregister((void *)start, len) != 0) {
566 assert(false);
567 }
568 }
569
570 }
571
572 static void
573 spdk_vhost_free_reactor(uint32_t lcore)
574 {
575 g_num_ctrlrs[lcore]--;
576 }
577
578 struct spdk_vhost_dev *
579 spdk_vhost_dev_find(const char *ctrlr_name)
580 {
581 struct spdk_vhost_dev *vdev;
582 size_t dev_dirname_len = strlen(dev_dirname);
583
584 if (strncmp(ctrlr_name, dev_dirname, dev_dirname_len) == 0) {
585 ctrlr_name += dev_dirname_len;
586 }
587
588 TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) {
589 if (strcmp(vdev->name, ctrlr_name) == 0) {
590 return vdev;
591 }
592 }
593
594 return NULL;
595 }
596
597 static int
598 spdk_vhost_parse_core_mask(const char *mask, struct spdk_cpuset *cpumask)
599 {
600 int rc;
601
602 if (cpumask == NULL) {
603 return -1;
604 }
605
606 if (mask == NULL) {
607 spdk_cpuset_copy(cpumask, spdk_app_get_core_mask());
608 return 0;
609 }
610
611 rc = spdk_app_parse_core_mask(mask, cpumask);
612 if (rc < 0) {
613 SPDK_ERRLOG("invalid cpumask %s\n", mask);
614 return -1;
615 }
616
617 if (spdk_cpuset_count(cpumask) == 0) {
618 SPDK_ERRLOG("no cpu is selected among reactor mask(=%s)\n",
619 spdk_cpuset_fmt(spdk_app_get_core_mask()));
620 return -1;
621 }
622
623 return 0;
624 }
625
626 static void *
627 _start_rte_driver(void *arg)
628 {
629 char *path = arg;
630
631 if (rte_vhost_driver_start(path) != 0) {
632 return NULL;
633 }
634
635 return path;
636 }
637
638 int
639 spdk_vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const char *mask_str,
640 const struct spdk_vhost_dev_backend *backend)
641 {
642 static unsigned ctrlr_num;
643 char path[PATH_MAX];
644 struct stat file_stat;
645 struct spdk_cpuset *cpumask;
646 int rc;
647
648 assert(vdev);
649
650 /* We expect devices inside g_spdk_vhost_devices to be sorted in ascending
651 * order of vdev->id. For now we always set vdev->id = ctrlr_num++
652 * and append each vdev to the very end of g_spdk_vhost_devices list.
653 * This is required for foreach vhost events to work.
654 */
655 if (ctrlr_num == UINT_MAX) {
656 assert(false);
657 return -EINVAL;
658 }
659
660 if (name == NULL) {
661 SPDK_ERRLOG("Can't register controller with no name\n");
662 return -EINVAL;
663 }
664
665 cpumask = spdk_cpuset_alloc();
666 if (!cpumask) {
667 SPDK_ERRLOG("spdk_cpuset_alloc failed\n");
668 return -ENOMEM;
669 }
670
671 if (spdk_vhost_parse_core_mask(mask_str, cpumask) != 0) {
672 SPDK_ERRLOG("cpumask %s is invalid (app mask is 0x%s)\n",
673 mask_str, spdk_cpuset_fmt(spdk_app_get_core_mask()));
674 rc = -EINVAL;
675 goto out;
676 }
677
678 if (spdk_vhost_dev_find(name)) {
679 SPDK_ERRLOG("vhost controller %s already exists.\n", name);
680 rc = -EEXIST;
681 goto out;
682 }
683
684 if (snprintf(path, sizeof(path), "%s%s", dev_dirname, name) >= (int)sizeof(path)) {
685 SPDK_ERRLOG("Resulting socket path for controller %s is too long: %s%s\n", name, dev_dirname,
686 name);
687 rc = -EINVAL;
688 goto out;
689 }
690
691 /* Register vhost driver to handle vhost messages. */
692 if (stat(path, &file_stat) != -1) {
693 if (!S_ISSOCK(file_stat.st_mode)) {
694 SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": "
695 "The file already exists and is not a socket.\n",
696 path);
697 rc = -EIO;
698 goto out;
699 } else if (unlink(path) != 0) {
700 SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": "
701 "The socket already exists and failed to unlink.\n",
702 path);
703 rc = -EIO;
704 goto out;
705 }
706 }
707
708 if (rte_vhost_driver_register(path, 0) != 0) {
709 SPDK_ERRLOG("Could not register controller %s with vhost library\n", name);
710 SPDK_ERRLOG("Check if domain socket %s already exists\n", path);
711 rc = -EIO;
712 goto out;
713 }
714 if (rte_vhost_driver_set_features(path, backend->virtio_features) ||
715 rte_vhost_driver_disable_features(path, backend->disabled_features)) {
716 SPDK_ERRLOG("Couldn't set vhost features for controller %s\n", name);
717
718 rte_vhost_driver_unregister(path);
719 rc = -EIO;
720 goto out;
721 }
722
723 if (rte_vhost_driver_callback_register(path, &g_spdk_vhost_ops) != 0) {
724 rte_vhost_driver_unregister(path);
725 SPDK_ERRLOG("Couldn't register callbacks for controller %s\n", name);
726 rc = -EIO;
727 goto out;
728 }
729
730 /* The following might start a POSIX thread that polls for incoming
731 * socket connections and calls backend->start/stop_device. These backend
732 * callbacks are also protected by the global SPDK vhost mutex, so we're
733 * safe with not initializing the vdev just yet.
734 */
735 if (spdk_call_unaffinitized(_start_rte_driver, path) == NULL) {
736 SPDK_ERRLOG("Failed to start vhost driver for controller %s (%d): %s\n",
737 name, errno, spdk_strerror(errno));
738 rte_vhost_driver_unregister(path);
739 rc = -EIO;
740 goto out;
741 }
742
743 vdev->name = strdup(name);
744 vdev->path = strdup(path);
745 vdev->id = ctrlr_num++;
746 vdev->vid = -1;
747 vdev->lcore = -1;
748 vdev->cpumask = cpumask;
749 vdev->registered = true;
750 vdev->backend = backend;
751
752 spdk_vhost_set_coalescing(vdev, SPDK_VHOST_COALESCING_DELAY_BASE_US,
753 SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD);
754 vdev->next_stats_check_time = 0;
755 vdev->stats_check_interval = SPDK_VHOST_DEV_STATS_CHECK_INTERVAL_MS * spdk_get_ticks_hz() /
756 1000UL;
757
758 TAILQ_INSERT_TAIL(&g_spdk_vhost_devices, vdev, tailq);
759
760 SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: new controller added\n", vdev->name);
761 return 0;
762
763 out:
764 spdk_cpuset_free(cpumask);
765 return rc;
766 }
767
768 int
769 spdk_vhost_dev_unregister(struct spdk_vhost_dev *vdev)
770 {
771 if (vdev->vid != -1) {
772 SPDK_ERRLOG("Controller %s still has a valid connection.\n", vdev->name);
773 return -EBUSY;
774 }
775
776 if (vdev->registered && rte_vhost_driver_unregister(vdev->path) != 0) {
777 SPDK_ERRLOG("Could not unregister controller %s with vhost library\n"
778 "Check if domain socket %s still exists\n",
779 vdev->name, vdev->path);
780 return -EIO;
781 }
782
783 SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: removed\n", vdev->name);
784
785 free(vdev->name);
786 free(vdev->path);
787 spdk_cpuset_free(vdev->cpumask);
788 TAILQ_REMOVE(&g_spdk_vhost_devices, vdev, tailq);
789 return 0;
790 }
791
792 static struct spdk_vhost_dev *
793 spdk_vhost_dev_next(unsigned i)
794 {
795 struct spdk_vhost_dev *vdev;
796
797 TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) {
798 if (vdev->id > i) {
799 return vdev;
800 }
801 }
802
803 return NULL;
804 }
805
806 const char *
807 spdk_vhost_dev_get_name(struct spdk_vhost_dev *vdev)
808 {
809 assert(vdev != NULL);
810 return vdev->name;
811 }
812
813 const struct spdk_cpuset *
814 spdk_vhost_dev_get_cpumask(struct spdk_vhost_dev *vdev)
815 {
816 assert(vdev != NULL);
817 return vdev->cpumask;
818 }
819
820 static uint32_t
821 spdk_vhost_allocate_reactor(struct spdk_cpuset *cpumask)
822 {
823 uint32_t i, selected_core;
824 uint32_t min_ctrlrs;
825
826 min_ctrlrs = INT_MAX;
827 selected_core = spdk_env_get_first_core();
828
829 SPDK_ENV_FOREACH_CORE(i) {
830 if (!spdk_cpuset_get_cpu(cpumask, i)) {
831 continue;
832 }
833
834 if (g_num_ctrlrs[i] < min_ctrlrs) {
835 selected_core = i;
836 min_ctrlrs = g_num_ctrlrs[i];
837 }
838 }
839
840 g_num_ctrlrs[selected_core]++;
841 return selected_core;
842 }
843
844 void
845 spdk_vhost_dev_backend_event_done(void *event_ctx, int response)
846 {
847 struct spdk_vhost_dev_event_ctx *ctx = event_ctx;
848
849 ctx->response = response;
850 sem_post(&ctx->sem);
851 }
852
853 static void
854 spdk_vhost_event_cb(void *arg1, void *arg2)
855 {
856 struct spdk_vhost_dev_event_ctx *ctx = arg1;
857
858 ctx->cb_fn(ctx->vdev, ctx);
859 }
860
861 static void
862 spdk_vhost_event_async_fn(void *arg1, void *arg2)
863 {
864 struct spdk_vhost_dev_event_ctx *ctx = arg1;
865 struct spdk_vhost_dev *vdev;
866 struct spdk_event *ev;
867
868 if (pthread_mutex_trylock(&g_spdk_vhost_mutex) != 0) {
869 ev = spdk_event_allocate(spdk_env_get_current_core(), spdk_vhost_event_async_fn, arg1, arg2);
870 spdk_event_call(ev);
871 return;
872 }
873
874 vdev = spdk_vhost_dev_find_by_id(ctx->vdev_id);
875 if (vdev != ctx->vdev) {
876 /* vdev has been changed after enqueuing this event */
877 vdev = NULL;
878 }
879
880 if (vdev != NULL && vdev->lcore >= 0 &&
881 (uint32_t)vdev->lcore != spdk_env_get_current_core()) {
882 /* if vdev has been relocated to another core, it is no longer thread-safe
883 * to access its contents here. Even though we're running under global vhost
884 * mutex, the controller itself (and its pollers) are not. We need to chase
885 * the vdev thread as many times as necessary.
886 */
887 ev = spdk_event_allocate(vdev->lcore, spdk_vhost_event_async_fn, arg1, arg2);
888 spdk_event_call(ev);
889 pthread_mutex_unlock(&g_spdk_vhost_mutex);
890 return;
891 }
892
893 ctx->cb_fn(vdev, arg2);
894 pthread_mutex_unlock(&g_spdk_vhost_mutex);
895
896 free(ctx);
897 }
898
899 static void spdk_vhost_external_event_foreach_continue(struct spdk_vhost_dev *vdev,
900 spdk_vhost_event_fn fn, void *arg);
901
902 static void
903 spdk_vhost_event_async_foreach_fn(void *arg1, void *arg2)
904 {
905 struct spdk_vhost_dev_event_ctx *ctx = arg1;
906 struct spdk_vhost_dev *vdev;
907 struct spdk_event *ev;
908
909 if (pthread_mutex_trylock(&g_spdk_vhost_mutex) != 0) {
910 ev = spdk_event_allocate(spdk_env_get_current_core(),
911 spdk_vhost_event_async_foreach_fn, arg1, arg2);
912 spdk_event_call(ev);
913 return;
914 }
915
916 vdev = spdk_vhost_dev_find_by_id(ctx->vdev_id);
917 if (vdev != ctx->vdev) {
918 /* ctx->vdev is probably a dangling pointer at this point.
919 * It must have been removed in the meantime, so we just skip
920 * it in our foreach chain. */
921 goto out_unlock_continue;
922 }
923
924 /* the assert is just for static analyzers, vdev cannot be NULL here */
925 assert(vdev != NULL);
926 if (vdev->lcore >= 0 &&
927 (uint32_t)vdev->lcore != spdk_env_get_current_core()) {
928 /* if vdev has been relocated to another core, it is no longer thread-safe
929 * to access its contents here. Even though we're running under global vhost
930 * mutex, the controller itself (and its pollers) are not. We need to chase
931 * the vdev thread as many times as necessary.
932 */
933 ev = spdk_event_allocate(vdev->lcore,
934 spdk_vhost_event_async_foreach_fn, arg1, arg2);
935 spdk_event_call(ev);
936 pthread_mutex_unlock(&g_spdk_vhost_mutex);
937 return;
938 }
939
940 ctx->cb_fn(vdev, arg2);
941
942 out_unlock_continue:
943 vdev = spdk_vhost_dev_next(ctx->vdev_id);
944 spdk_vhost_external_event_foreach_continue(vdev, ctx->cb_fn, arg2);
945 pthread_mutex_unlock(&g_spdk_vhost_mutex);
946
947 free(ctx);
948 }
949
950 static int
951 _spdk_vhost_event_send(struct spdk_vhost_dev *vdev, spdk_vhost_event_fn cb_fn,
952 unsigned timeout_sec, const char *errmsg)
953 {
954 struct spdk_vhost_dev_event_ctx ev_ctx = {0};
955 struct spdk_event *ev;
956 struct timespec timeout;
957 int rc;
958
959 rc = sem_init(&ev_ctx.sem, 0, 0);
960 if (rc != 0) {
961 SPDK_ERRLOG("Failed to initialize semaphore for vhost timed event\n");
962 return -errno;
963 }
964
965 ev_ctx.vdev = vdev;
966 ev_ctx.cb_fn = cb_fn;
967 ev = spdk_event_allocate(vdev->lcore, spdk_vhost_event_cb, &ev_ctx, NULL);
968 assert(ev);
969 spdk_event_call(ev);
970 pthread_mutex_unlock(&g_spdk_vhost_mutex);
971
972 clock_gettime(CLOCK_REALTIME, &timeout);
973 timeout.tv_sec += timeout_sec;
974
975 rc = sem_timedwait(&ev_ctx.sem, &timeout);
976 if (rc != 0) {
977 SPDK_ERRLOG("Timeout waiting for event: %s.\n", errmsg);
978 sem_wait(&ev_ctx.sem);
979 }
980
981 sem_destroy(&ev_ctx.sem);
982 pthread_mutex_lock(&g_spdk_vhost_mutex);
983 return ev_ctx.response;
984 }
985
986 static int
987 spdk_vhost_event_async_send(struct spdk_vhost_dev *vdev, spdk_vhost_event_fn cb_fn, void *arg,
988 bool foreach)
989 {
990 struct spdk_vhost_dev_event_ctx *ev_ctx;
991 struct spdk_event *ev;
992 spdk_event_fn fn;
993
994 ev_ctx = calloc(1, sizeof(*ev_ctx));
995 if (ev_ctx == NULL) {
996 SPDK_ERRLOG("Failed to alloc vhost event.\n");
997 assert(false);
998 return -ENOMEM;
999 }
1000
1001 ev_ctx->vdev = vdev;
1002 ev_ctx->vdev_id = vdev->id;
1003 ev_ctx->cb_fn = cb_fn;
1004
1005 fn = foreach ? spdk_vhost_event_async_foreach_fn : spdk_vhost_event_async_fn;
1006 ev = spdk_event_allocate(ev_ctx->vdev->lcore, fn, ev_ctx, arg);
1007 assert(ev);
1008 spdk_event_call(ev);
1009
1010 return 0;
1011 }
1012
1013 static void
1014 stop_device(int vid)
1015 {
1016 struct spdk_vhost_dev *vdev;
1017 struct rte_vhost_vring *q;
1018 int rc;
1019 uint16_t i;
1020
1021 pthread_mutex_lock(&g_spdk_vhost_mutex);
1022 vdev = spdk_vhost_dev_find_by_vid(vid);
1023 if (vdev == NULL) {
1024 SPDK_ERRLOG("Couldn't find device with vid %d to stop.\n", vid);
1025 pthread_mutex_unlock(&g_spdk_vhost_mutex);
1026 return;
1027 }
1028
1029 if (vdev->lcore == -1) {
1030 SPDK_ERRLOG("Controller %s is not loaded.\n", vdev->name);
1031 pthread_mutex_unlock(&g_spdk_vhost_mutex);
1032 return;
1033 }
1034
1035 rc = _spdk_vhost_event_send(vdev, vdev->backend->stop_device, 3, "stop device");
1036 if (rc != 0) {
1037 SPDK_ERRLOG("Couldn't stop device with vid %d.\n", vid);
1038 pthread_mutex_unlock(&g_spdk_vhost_mutex);
1039 return;
1040 }
1041
1042 for (i = 0; i < vdev->max_queues; i++) {
1043 q = &vdev->virtqueue[i].vring;
1044 if (q->desc == NULL) {
1045 continue;
1046 }
1047 rte_vhost_set_vhost_vring_last_idx(vdev->vid, i, q->last_avail_idx, q->last_used_idx);
1048 }
1049
1050 spdk_vhost_dev_mem_unregister(vdev);
1051 free(vdev->mem);
1052 spdk_vhost_free_reactor(vdev->lcore);
1053 vdev->lcore = -1;
1054 pthread_mutex_unlock(&g_spdk_vhost_mutex);
1055 }
1056
1057 static int
1058 start_device(int vid)
1059 {
1060 struct spdk_vhost_dev *vdev;
1061 int rc = -1;
1062 uint16_t i;
1063
1064 pthread_mutex_lock(&g_spdk_vhost_mutex);
1065
1066 vdev = spdk_vhost_dev_find_by_vid(vid);
1067 if (vdev == NULL) {
1068 SPDK_ERRLOG("Controller with vid %d doesn't exist.\n", vid);
1069 goto out;
1070 }
1071
1072 if (vdev->lcore != -1) {
1073 SPDK_ERRLOG("Controller %s already loaded.\n", vdev->name);
1074 goto out;
1075 }
1076
1077 vdev->max_queues = 0;
1078 memset(vdev->virtqueue, 0, sizeof(vdev->virtqueue));
1079 for (i = 0; i < SPDK_VHOST_MAX_VQUEUES; i++) {
1080 if (rte_vhost_get_vhost_vring(vid, i, &vdev->virtqueue[i].vring)) {
1081 continue;
1082 }
1083
1084 if (vdev->virtqueue[i].vring.desc == NULL ||
1085 vdev->virtqueue[i].vring.size == 0) {
1086 continue;
1087 }
1088
1089 /* Disable notifications. */
1090 if (rte_vhost_enable_guest_notification(vid, i, 0) != 0) {
1091 SPDK_ERRLOG("vhost device %d: Failed to disable guest notification on queue %"PRIu16"\n", vid, i);
1092 goto out;
1093 }
1094
1095 vdev->max_queues = i + 1;
1096 }
1097
1098 if (rte_vhost_get_negotiated_features(vid, &vdev->negotiated_features) != 0) {
1099 SPDK_ERRLOG("vhost device %d: Failed to get negotiated driver features\n", vid);
1100 goto out;
1101 }
1102
1103 if (rte_vhost_get_mem_table(vid, &vdev->mem) != 0) {
1104 SPDK_ERRLOG("vhost device %d: Failed to get guest memory table\n", vid);
1105 goto out;
1106 }
1107
1108 /*
1109 * Not sure right now, but this looks like some kind of QEMU bug: guest IO
1110 * might be frozen without kicking all queues after live migration. It looks like
1111 * the previous vhost instance failed to effectively deliver all interrupts before
1112 * the GET_VRING_BASE message. This shouldn't harm the guest, since spurious interrupts
1113 * should be ignored by the guest virtio driver.
1114 *
1115 * Tested on QEMU 2.10.91 and 2.11.50.
1116 */
1117 for (i = 0; i < vdev->max_queues; i++) {
1118 if (vdev->virtqueue[i].vring.callfd != -1) {
1119 eventfd_write(vdev->virtqueue[i].vring.callfd, (eventfd_t)1);
1120 }
1121 }
1122
1123 vdev->lcore = spdk_vhost_allocate_reactor(vdev->cpumask);
1124 spdk_vhost_dev_mem_register(vdev);
1125 rc = _spdk_vhost_event_send(vdev, vdev->backend->start_device, 3, "start device");
1126 if (rc != 0) {
1127 spdk_vhost_dev_mem_unregister(vdev);
1128 free(vdev->mem);
1129 spdk_vhost_free_reactor(vdev->lcore);
1130 vdev->lcore = -1;
1131 }
1132
1133 out:
1134 pthread_mutex_unlock(&g_spdk_vhost_mutex);
1135 return rc;
1136 }
1137
1138 static int
1139 get_config(int vid, uint8_t *config, uint32_t len)
1140 {
1141 struct spdk_vhost_dev *vdev;
1142 int rc = -1;
1143
1144 pthread_mutex_lock(&g_spdk_vhost_mutex);
1145 vdev = spdk_vhost_dev_find_by_vid(vid);
1146 if (vdev == NULL) {
1147 SPDK_ERRLOG("Controller with vid %d doesn't exist.\n", vid);
1148 goto out;
1149 }
1150
1151 if (vdev->backend->vhost_get_config) {
1152 rc = vdev->backend->vhost_get_config(vdev, config, len);
1153 }
1154
1155 out:
1156 pthread_mutex_unlock(&g_spdk_vhost_mutex);
1157 return rc;
1158 }
1159
1160 static int
1161 set_config(int vid, uint8_t *config, uint32_t offset, uint32_t size, uint32_t flags)
1162 {
1163 struct spdk_vhost_dev *vdev;
1164 int rc = -1;
1165
1166 pthread_mutex_lock(&g_spdk_vhost_mutex);
1167 vdev = spdk_vhost_dev_find_by_vid(vid);
1168 if (vdev == NULL) {
1169 SPDK_ERRLOG("Controller with vid %d doesn't exist.\n", vid);
1170 goto out;
1171 }
1172
1173 if (vdev->backend->vhost_set_config) {
1174 rc = vdev->backend->vhost_set_config(vdev, config, offset, size, flags);
1175 }
1176
1177 out:
1178 pthread_mutex_unlock(&g_spdk_vhost_mutex);
1179 return rc;
1180 }
1181
1182 int
1183 spdk_vhost_set_socket_path(const char *basename)
1184 {
1185 int ret;
1186
1187 if (basename && strlen(basename) > 0) {
1188 ret = snprintf(dev_dirname, sizeof(dev_dirname) - 2, "%s", basename);
1189 if (ret <= 0) {
1190 return -EINVAL;
1191 }
1192 if ((size_t)ret >= sizeof(dev_dirname) - 2) {
1193 SPDK_ERRLOG("Char dev dir path length %d is too long\n", ret);
1194 return -EINVAL;
1195 }
1196
1197 if (dev_dirname[ret - 1] != '/') {
1198 dev_dirname[ret] = '/';
1199 dev_dirname[ret + 1] = '\0';
1200 }
1201 }
1202
1203 return 0;
1204 }
1205
1206 static void *
1207 session_shutdown(void *arg)
1208 {
1209 struct spdk_vhost_dev *vdev = NULL;
1210
1211 TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) {
1212 rte_vhost_driver_unregister(vdev->path);
1213 vdev->registered = false;
1214 }
1215
1216 SPDK_INFOLOG(SPDK_LOG_VHOST, "Exiting\n");
1217 spdk_event_call((struct spdk_event *)arg);
1218 return NULL;
1219 }
1220
1221 void
1222 spdk_vhost_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
1223 {
1224 assert(vdev->backend->dump_info_json != NULL);
1225 vdev->backend->dump_info_json(vdev, w);
1226 }
1227
1228 int
1229 spdk_vhost_dev_remove(struct spdk_vhost_dev *vdev)
1230 {
1231 return vdev->backend->remove_device(vdev);
1232 }
1233
1234 static int
1235 new_connection(int vid)
1236 {
1237 struct spdk_vhost_dev *vdev;
1238 char ifname[PATH_MAX];
1239
1240 pthread_mutex_lock(&g_spdk_vhost_mutex);
1241 if (rte_vhost_get_ifname(vid, ifname, PATH_MAX) < 0) {
1242 SPDK_ERRLOG("Couldn't get a valid ifname for device with vid %d\n", vid);
1243 pthread_mutex_unlock(&g_spdk_vhost_mutex);
1244 return -1;
1245 }
1246
1247 vdev = spdk_vhost_dev_find(ifname);
1248 if (vdev == NULL) {
1249 SPDK_ERRLOG("Couldn't find device with vid %d to create connection for.\n", vid);
1250 pthread_mutex_unlock(&g_spdk_vhost_mutex);
1251 return -1;
1252 }
1253
1254 /* Since pollers are not running, it is safe not to use spdk_event here. */
1255 if (vdev->vid != -1) {
1256 SPDK_ERRLOG("Device with vid %d is already connected.\n", vid);
1257 pthread_mutex_unlock(&g_spdk_vhost_mutex);
1258 return -1;
1259 }
1260
1261 vdev->vid = vid;
1262 pthread_mutex_unlock(&g_spdk_vhost_mutex);
1263 return 0;
1264 }
1265
1266 static void
1267 destroy_connection(int vid)
1268 {
1269 struct spdk_vhost_dev *vdev;
1270
1271 pthread_mutex_lock(&g_spdk_vhost_mutex);
1272 vdev = spdk_vhost_dev_find_by_vid(vid);
1273 if (vdev == NULL) {
1274 SPDK_ERRLOG("Couldn't find device with vid %d to destroy connection for.\n", vid);
1275 pthread_mutex_unlock(&g_spdk_vhost_mutex);
1276 return;
1277 }
1278
1279 /* Since pollers are not running, it is safe not to use spdk_event here. */
1280 vdev->vid = -1;
1281 pthread_mutex_unlock(&g_spdk_vhost_mutex);
1282 }
1283
1284 void
1285 spdk_vhost_call_external_event(const char *ctrlr_name, spdk_vhost_event_fn fn, void *arg)
1286 {
1287 struct spdk_vhost_dev *vdev;
1288
1289 pthread_mutex_lock(&g_spdk_vhost_mutex);
1290 vdev = spdk_vhost_dev_find(ctrlr_name);
1291
1292 if (vdev == NULL) {
1293 pthread_mutex_unlock(&g_spdk_vhost_mutex);
1294 fn(NULL, arg);
1295 return;
1296 }
1297
1298 if (vdev->lcore == -1) {
1299 fn(vdev, arg);
1300 } else {
1301 spdk_vhost_event_async_send(vdev, fn, arg, false);
1302 }
1303
1304 pthread_mutex_unlock(&g_spdk_vhost_mutex);
1305 }
1306
1307 static void
1308 spdk_vhost_external_event_foreach_continue(struct spdk_vhost_dev *vdev,
1309 spdk_vhost_event_fn fn, void *arg)
1310 {
1311 if (vdev == NULL) {
1312 fn(NULL, arg);
1313 return;
1314 }
1315
1316 while (vdev->lcore == -1) {
1317 fn(vdev, arg);
1318 vdev = spdk_vhost_dev_next(vdev->id);
1319 if (vdev == NULL) {
1320 fn(NULL, arg);
1321 return;
1322 }
1323 }
1324
1325 spdk_vhost_event_async_send(vdev, fn, arg, true);
1326 }
1327
1328 void
1329 spdk_vhost_call_external_event_foreach(spdk_vhost_event_fn fn, void *arg)
1330 {
1331 struct spdk_vhost_dev *vdev;
1332
1333 pthread_mutex_lock(&g_spdk_vhost_mutex);
1334 vdev = TAILQ_FIRST(&g_spdk_vhost_devices);
1335 spdk_vhost_external_event_foreach_continue(vdev, fn, arg);
1336 pthread_mutex_unlock(&g_spdk_vhost_mutex);
1337 }
1338
1339 void
1340 spdk_vhost_lock(void)
1341 {
1342 pthread_mutex_lock(&g_spdk_vhost_mutex);
1343 }
1344
1345 void
1346 spdk_vhost_unlock(void)
1347 {
1348 pthread_mutex_unlock(&g_spdk_vhost_mutex);
1349 }
1350
1351 int
1352 spdk_vhost_init(void)
1353 {
1354 uint32_t last_core;
1355 size_t len;
1356 int ret;
1357
1358 if (dev_dirname[0] == '\0') {
1359 if (getcwd(dev_dirname, sizeof(dev_dirname) - 1) == NULL) {
1360 SPDK_ERRLOG("getcwd failed (%d): %s\n", errno, spdk_strerror(errno));
1361 return -1;
1362 }
1363
1364 len = strlen(dev_dirname);
1365 if (dev_dirname[len - 1] != '/') {
1366 dev_dirname[len] = '/';
1367 dev_dirname[len + 1] = '\0';
1368 }
1369 }
1370
1371 last_core = spdk_env_get_last_core();
1372 g_num_ctrlrs = calloc(last_core + 1, sizeof(uint32_t));
1373 if (!g_num_ctrlrs) {
1374 SPDK_ERRLOG("Could not allocate array size=%u for g_num_ctrlrs\n",
1375 last_core + 1);
1376 return -1;
1377 }
1378
1379 ret = spdk_vhost_scsi_controller_construct();
1380 if (ret != 0) {
1381 SPDK_ERRLOG("Cannot construct vhost controllers\n");
1382 return -1;
1383 }
1384
1385 ret = spdk_vhost_blk_controller_construct();
1386 if (ret != 0) {
1387 SPDK_ERRLOG("Cannot construct vhost block controllers\n");
1388 return -1;
1389 }
1390
1391 ret = spdk_vhost_nvme_controller_construct();
1392 if (ret != 0) {
1393 SPDK_ERRLOG("Cannot construct vhost NVMe controllers\n");
1394 return -1;
1395 }
1396
1397 return 0;
1398 }
1399
1400 static int
1401 _spdk_vhost_fini_remove_vdev_cb(struct spdk_vhost_dev *vdev, void *arg)
1402 {
1403 spdk_vhost_fini_cb fini_cb = arg;
1404
1405 if (vdev != NULL) {
1406 spdk_vhost_dev_remove(vdev);
1407 return 0;
1408 }
1409
1410 /* All devices are removed now. */
1411 free(g_num_ctrlrs);
1412 fini_cb();
1413 return 0;
1414 }
1415
1416 static void
1417 _spdk_vhost_fini(void *arg1, void *arg2)
1418 {
1419 spdk_vhost_fini_cb fini_cb = arg1;
1420
1421 spdk_vhost_call_external_event_foreach(_spdk_vhost_fini_remove_vdev_cb, fini_cb);
1422 }
1423
1424 void
1425 spdk_vhost_fini(spdk_vhost_fini_cb fini_cb)
1426 {
1427 pthread_t tid;
1428 int rc;
1429 struct spdk_event *fini_ev;
1430
1431 fini_ev = spdk_event_allocate(spdk_env_get_current_core(), _spdk_vhost_fini, fini_cb, NULL);
1432
1433 /* rte_vhost API for removing sockets is not asynchronous. Since it may call SPDK
1434 * ops for stopping a device or removing a connection, we need to call it from
1435 * a separate thread to avoid deadlock.
1436 */
1437 rc = pthread_create(&tid, NULL, &session_shutdown, fini_ev);
1438 if (rc < 0) {
1439 SPDK_ERRLOG("Failed to start session shutdown thread (%d): %s\n", rc, spdk_strerror(rc));
1440 abort();
1441 }
1442 pthread_detach(tid);
1443 }
1444
1445 struct spdk_vhost_write_config_json_ctx {
1446 struct spdk_json_write_ctx *w;
1447 struct spdk_event *done_ev;
1448 };
1449
1450 static int
1451 spdk_vhost_config_json_cb(struct spdk_vhost_dev *vdev, void *arg)
1452 {
1453 struct spdk_vhost_write_config_json_ctx *ctx = arg;
1454 uint32_t delay_base_us;
1455 uint32_t iops_threshold;
1456
1457 if (vdev == NULL) {
1458 spdk_json_write_array_end(ctx->w);
1459 spdk_event_call(ctx->done_ev);
1460 free(ctx);
1461 return 0;
1462 }
1463
1464 vdev->backend->write_config_json(vdev, ctx->w);
1465
1466 spdk_vhost_get_coalescing(vdev, &delay_base_us, &iops_threshold);
1467 if (delay_base_us) {
1468 spdk_json_write_object_begin(ctx->w);
1469 spdk_json_write_named_string(ctx->w, "method", "set_vhost_controller_coalescing");
1470
1471 spdk_json_write_named_object_begin(ctx->w, "params");
1472 spdk_json_write_named_string(ctx->w, "ctrlr", vdev->name);
1473 spdk_json_write_named_uint32(ctx->w, "delay_base_us", delay_base_us);
1474 spdk_json_write_named_uint32(ctx->w, "iops_threshold", iops_threshold);
1475 spdk_json_write_object_end(ctx->w);
1476
1477 spdk_json_write_object_end(ctx->w);
1478 }
1479
1480 return 0;
1481 }
1482
1483 void
1484 spdk_vhost_config_json(struct spdk_json_write_ctx *w, struct spdk_event *done_ev)
1485 {
1486 struct spdk_vhost_write_config_json_ctx *ctx;
1487
1488 ctx = calloc(1, sizeof(*ctx));
1489 if (!ctx) {
1490 spdk_event_call(done_ev);
1491 return;
1492 }
1493
1494 ctx->w = w;
1495 ctx->done_ev = done_ev;
1496
1497 spdk_json_write_array_begin(w);
1498
1499 spdk_vhost_call_external_event_foreach(spdk_vhost_config_json_cb, ctx);
1500 }
1501
1502 SPDK_LOG_REGISTER_COMPONENT("vhost", SPDK_LOG_VHOST)
1503 SPDK_LOG_REGISTER_COMPONENT("vhost_ring", SPDK_LOG_VHOST_RING)