/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <stdint.h>
#include <linux/virtio_net.h>

#include <rte_mbuf.h>
#include <rte_memcpy.h>
#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_sctp.h>
#include <rte_arp.h>
#include <rte_virtio_net.h>

#include "vhost.h"	/* internal virtio_net/vhost_virtqueue definitions */

#define MAX_PKT_BURST 32
#define VHOST_LOG_PAGE 4096
static inline void __attribute__((always_inline))
vhost_log_page(uint8_t *log_base, uint64_t page)
{
	log_base[page / 8] |= 1 << (page % 8);
}
static inline void __attribute__((always_inline))
vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
{
	uint64_t page;

	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
		   !dev->log_base || !len))
		return;

	if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
		return;

	/* To make sure guest memory updates are committed before logging */
	rte_smp_wmb();

	page = addr / VHOST_LOG_PAGE;
	while (page * VHOST_LOG_PAGE < addr + len) {
		vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
		page += 1;
	}
}
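/*
 * Example of the dirty-log math above: with VHOST_LOG_PAGE = 4096, a guest
 * write to physical address 0x3000 of length 0x2000 covers pages 3 and 4,
 * so bits 3 and 4 of log_base[0] get set.
 */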
static inline void __attribute__((always_inline))
vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
		     uint64_t offset, uint64_t len)
{
	vhost_log_write(dev, vq->log_guest_addr + offset, len);
}
static inline int __attribute__((always_inline))
is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
{
	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
}
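/*
 * Virtqueues come in RX/TX pairs: even indexes (0, 2, ...) are RX queues
 * (queue 0 being VIRTIO_RXQ) and odd indexes are TX queues (queue 1 being
 * VIRTIO_TXQ). For example, with qp_nb = 2 the valid RX indexes are 0 and 2
 * and the valid TX indexes are 1 and 3.
 */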
static inline void __attribute__((always_inline))
do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
			  uint16_t to, uint16_t from, uint16_t size)
{
	rte_memcpy(&vq->used->ring[to],
		   &vq->shadow_used_ring[from],
		   size * sizeof(struct vring_used_elem));
	vhost_log_used_vring(dev, vq,
			     offsetof(struct vring_used, ring[to]),
			     size * sizeof(struct vring_used_elem));
}
static inline void __attribute__((always_inline))
flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq)
{
	uint16_t used_idx = vq->last_used_idx & (vq->size - 1);

	if (used_idx + vq->shadow_used_idx <= vq->size) {
		do_flush_shadow_used_ring(dev, vq, used_idx, 0,
					  vq->shadow_used_idx);
	} else {
		uint16_t size;

		/* update used ring interval [used_idx, vq->size] */
		size = vq->size - used_idx;
		do_flush_shadow_used_ring(dev, vq, used_idx, 0, size);

		/* update the left half used ring interval [0, left_size] */
		do_flush_shadow_used_ring(dev, vq, 0, size,
					  vq->shadow_used_idx - size);
	}
	vq->last_used_idx += vq->shadow_used_idx;

	rte_smp_wmb();

	*(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
			     sizeof(vq->used->idx));
}
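/*
 * Example of the wrap-around case above: with a ring of size 256,
 * last_used_idx = 250 and 10 shadow entries, the first copy fills used ring
 * slots 250..255 (6 entries) and the second copy fills slots 0..3 (the
 * remaining 4 entries).
 */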
static inline void __attribute__((always_inline))
update_shadow_used_ring(struct vhost_virtqueue *vq,
			uint16_t desc_idx, uint16_t len)
{
	uint16_t i = vq->shadow_used_idx++;

	vq->shadow_used_ring[i].id  = desc_idx;
	vq->shadow_used_ring[i].len = len;
}
static void
virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
{
	if (m_buf->ol_flags & PKT_TX_L4_MASK) {
		net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
		net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;

		switch (m_buf->ol_flags & PKT_TX_L4_MASK) {
		case PKT_TX_TCP_CKSUM:
			net_hdr->csum_offset = (offsetof(struct tcp_hdr,
						cksum));
			break;
		case PKT_TX_UDP_CKSUM:
			net_hdr->csum_offset = (offsetof(struct udp_hdr,
						dgram_cksum));
			break;
		case PKT_TX_SCTP_CKSUM:
			net_hdr->csum_offset = (offsetof(struct sctp_hdr,
						cksum));
			break;
		}
	}

	if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
		if (m_buf->ol_flags & PKT_TX_IPV4)
			net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
		else
			net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
		net_hdr->gso_size = m_buf->tso_segsz;
		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
					+ m_buf->l4_len;
	}
}
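/*
 * Example: for a TCP packet with l2_len = 14 and l3_len = 20 that requests
 * PKT_TX_TCP_CKSUM, the header built above gets csum_start = 34 and
 * csum_offset = offsetof(struct tcp_hdr, cksum) = 16, telling the receiver
 * that the checksum still has to be computed and where to place it.
 */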
static inline void __attribute__((always_inline))
copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
		    struct virtio_net_hdr_mrg_rxbuf hdr)
{
	if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
		*(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
	else
		*(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
}
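/*
 * dev->vhost_hlen is 12 bytes (struct virtio_net_hdr_mrg_rxbuf) when the
 * VIRTIO_NET_F_MRG_RXBUF feature has been negotiated, and 10 bytes
 * (struct virtio_net_hdr) otherwise, which is why the cast above differs.
 */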
static inline int __attribute__((always_inline))
copy_mbuf_to_desc(struct virtio_net *dev, struct vring_desc *descs,
		  struct rte_mbuf *m, uint16_t desc_idx, uint32_t size)
{
	uint32_t desc_avail, desc_offset;
	uint32_t mbuf_avail, mbuf_offset;
	uint32_t cpy_len;
	struct vring_desc *desc;
	uint64_t desc_addr;
	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};

	desc = &descs[desc_idx];
	desc_addr = gpa_to_vva(dev, desc->addr);
	/*
	 * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
	 * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
	 * otherwise stores offset on the stack instead of in a register.
	 */
	if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
		return -1;

	rte_prefetch0((void *)(uintptr_t)desc_addr);

	virtio_enqueue_offload(m, &virtio_hdr.hdr);
	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);

	desc_offset = dev->vhost_hlen;
	desc_avail  = desc->len - dev->vhost_hlen;

	mbuf_avail  = rte_pktmbuf_data_len(m);
	mbuf_offset = 0;
	while (mbuf_avail != 0 || m->next != NULL) {
		/* done with current mbuf, fetch next */
		if (mbuf_avail == 0) {
			m = m->next;

			mbuf_offset = 0;
			mbuf_avail  = rte_pktmbuf_data_len(m);
		}

		/* done with current desc buf, fetch next */
		if (desc_avail == 0) {
			if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
				/* not enough room in the vring buffer */
				return -1;
			}
			if (unlikely(desc->next >= size))
				return -1;

			desc = &descs[desc->next];
			desc_addr = gpa_to_vva(dev, desc->addr);
			if (unlikely(!desc_addr))
				return -1;

			desc_offset = 0;
			desc_avail  = desc->len;
		}

		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
			cpy_len);
		vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
			     cpy_len, 0);

		mbuf_avail  -= cpy_len;
		mbuf_offset += cpy_len;
		desc_avail  -= cpy_len;
		desc_offset += cpy_len;
	}

	return 0;
}
/*
 * This function adds buffers to the virtio device's RX virtqueue. Buffers can
 * be received from the physical port or from another virtio device. A packet
 * count is returned to indicate the number of packets that were successfully
 * added to the RX queue. This function works when the mbuf is scattered, but
 * it doesn't support the mergeable feature.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
	      struct rte_mbuf **pkts, uint32_t count)
{
	struct vhost_virtqueue *vq;
	uint16_t avail_idx, free_entries, start_idx;
	uint16_t desc_indexes[MAX_PKT_BURST];
	struct vring_desc *descs;
	uint16_t used_idx;
	uint32_t i, sz;

	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
			dev->vid, __func__, queue_id);
		return 0;
	}

	vq = dev->virtqueue[queue_id];
	if (unlikely(vq->enabled == 0))
		return 0;

	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	start_idx = vq->last_used_idx;
	free_entries = avail_idx - start_idx;
	count = RTE_MIN(count, free_entries);
	count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
	if (count == 0)
		return 0;

	LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
		dev->vid, start_idx, start_idx + count);

	/* Retrieve all of the desc indexes first to avoid caching issues. */
	rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
	for (i = 0; i < count; i++) {
		used_idx = (start_idx + i) & (vq->size - 1);
		desc_indexes[i] = vq->avail->ring[used_idx];
		vq->used->ring[used_idx].id = desc_indexes[i];
		vq->used->ring[used_idx].len = pkts[i]->pkt_len +
					       dev->vhost_hlen;
		vhost_log_used_vring(dev, vq,
			offsetof(struct vring_used, ring[used_idx]),
			sizeof(vq->used->ring[used_idx]));
	}

	rte_prefetch0(&vq->desc[desc_indexes[0]]);
	for (i = 0; i < count; i++) {
		uint16_t desc_idx = desc_indexes[i];
		int err;

		if (vq->desc[desc_idx].flags & VRING_DESC_F_INDIRECT) {
			descs = (struct vring_desc *)(uintptr_t)gpa_to_vva(dev,
					vq->desc[desc_idx].addr);
			if (unlikely(!descs)) {
				count = i;
				break;
			}

			/* number of entries in the indirect desc table */
			sz = vq->desc[desc_idx].len / sizeof(*descs);
			desc_idx = 0;
		} else {
			descs = vq->desc;
			sz = vq->size;
		}

		err = copy_mbuf_to_desc(dev, descs, pkts[i], desc_idx, sz);
		if (unlikely(err)) {
			used_idx = (start_idx + i) & (vq->size - 1);
			vq->used->ring[used_idx].len = dev->vhost_hlen;
			vhost_log_used_vring(dev, vq,
				offsetof(struct vring_used, ring[used_idx]),
				sizeof(vq->used->ring[used_idx]));
		}

		if (i + 1 < count)
			rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
	}

	rte_smp_wmb();

	*(volatile uint16_t *)&vq->used->idx += count;
	vq->last_used_idx += count;
	vhost_log_used_vring(dev, vq,
		offsetof(struct vring_used, idx),
		sizeof(vq->used->idx));

	/* flush used->idx update before we read avail->flags. */
	rte_mb();

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
			&& (vq->callfd >= 0))
		eventfd_write(vq->callfd, (eventfd_t)1);

	return count;
}
static inline int __attribute__((always_inline))
fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
	     uint32_t avail_idx, uint32_t *vec_idx,
	     struct buf_vector *buf_vec, uint16_t *desc_chain_head,
	     uint16_t *desc_chain_len)
{
	uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
	uint32_t vec_id = *vec_idx;
	uint16_t len = 0;
	struct vring_desc *descs = vq->desc;

	*desc_chain_head = idx;

	if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
		descs = (struct vring_desc *)(uintptr_t)
			gpa_to_vva(dev, vq->desc[idx].addr);
		if (unlikely(!descs))
			return -1;

		idx = 0;
	}

	while (1) {
		if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
			return -1;

		len += descs[idx].len;
		buf_vec[vec_id].buf_addr = descs[idx].addr;
		buf_vec[vec_id].buf_len  = descs[idx].len;
		buf_vec[vec_id].desc_idx = idx;
		vec_id++;

		if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
			break;

		idx = descs[idx].next;
	}

	*desc_chain_len = len;
	*vec_idx = vec_id;

	return 0;
}
/*
 * Returns -1 on fail, 0 on success
 */
static inline int
reserve_avail_buf_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
			    uint32_t size, struct buf_vector *buf_vec,
			    uint16_t *num_buffers, uint16_t avail_head)
{
	uint16_t cur_idx;
	uint32_t vec_idx = 0;
	uint16_t tries = 0;

	uint16_t head_idx = 0;
	uint16_t len = 0;

	*num_buffers = 0;
	cur_idx = vq->last_avail_idx;

	while (size > 0) {
		if (unlikely(cur_idx == avail_head))
			return -1;

		if (unlikely(fill_vec_buf(dev, vq, cur_idx, &vec_idx, buf_vec,
						&head_idx, &len) < 0))
			return -1;
		len = RTE_MIN(len, size);
		update_shadow_used_ring(vq, head_idx, len);
		size -= len;

		cur_idx++;
		tries++;
		*num_buffers += 1;

		/*
		 * if we tried all available ring items, and still
		 * can't get enough buf, it means something abnormal
		 * happened.
		 */
		if (unlikely(tries >= vq->size))
			return -1;
	}

	return 0;
}
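/*
 * Example: enqueueing a 2958-byte packet with a 12-byte virtio header
 * (size = 2970) into a ring of single 2048-byte guest buffers reserves two
 * descriptor chains: the first contributes 2048 bytes, the second the
 * remaining 922, and *num_buffers ends up as 2.
 */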
static inline int __attribute__((always_inline))
copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct rte_mbuf *m,
			    struct buf_vector *buf_vec, uint16_t num_buffers)
{
	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
	uint32_t vec_idx = 0;
	uint64_t desc_addr;
	uint32_t mbuf_offset, mbuf_avail;
	uint32_t desc_offset, desc_avail;
	uint32_t cpy_len;
	uint64_t hdr_addr, hdr_phys_addr;
	struct rte_mbuf *hdr_mbuf;

	if (unlikely(m == NULL))
		return -1;

	desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
	if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
		return -1;

	hdr_mbuf = m;
	hdr_addr = desc_addr;
	hdr_phys_addr = buf_vec[vec_idx].buf_addr;
	rte_prefetch0((void *)(uintptr_t)hdr_addr);

	virtio_hdr.num_buffers = num_buffers;
	LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
		dev->vid, num_buffers);

	desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
	desc_offset = dev->vhost_hlen;

	mbuf_avail  = rte_pktmbuf_data_len(m);
	mbuf_offset = 0;
	while (mbuf_avail != 0 || m->next != NULL) {
		/* done with current desc buf, get the next one */
		if (desc_avail == 0) {
			vec_idx++;
			desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
			if (unlikely(!desc_addr))
				return -1;

			/* Prefetch buffer address. */
			rte_prefetch0((void *)(uintptr_t)desc_addr);
			desc_offset = 0;
			desc_avail  = buf_vec[vec_idx].buf_len;
		}

		/* done with current mbuf, get the next one */
		if (mbuf_avail == 0) {
			m = m->next;

			mbuf_offset = 0;
			mbuf_avail  = rte_pktmbuf_data_len(m);
		}

		if (hdr_addr) {
			virtio_enqueue_offload(hdr_mbuf, &virtio_hdr.hdr);
			copy_virtio_net_hdr(dev, hdr_addr, virtio_hdr);
			vhost_log_write(dev, hdr_phys_addr, dev->vhost_hlen);
			PRINT_PACKET(dev, (uintptr_t)hdr_addr,
				     dev->vhost_hlen, 0);

			hdr_addr = 0;
		}

		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
			cpy_len);
		vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
			cpy_len);
		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
			cpy_len, 0);

		mbuf_avail  -= cpy_len;
		mbuf_offset += cpy_len;
		desc_avail  -= cpy_len;
		desc_offset += cpy_len;
	}

	return 0;
}
static inline uint32_t __attribute__((always_inline))
virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
	struct rte_mbuf **pkts, uint32_t count)
{
	struct vhost_virtqueue *vq;
	uint32_t pkt_idx = 0;
	uint16_t num_buffers;
	struct buf_vector buf_vec[BUF_VECTOR_MAX];
	uint16_t avail_head;

	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
			dev->vid, __func__, queue_id);
		return 0;
	}

	vq = dev->virtqueue[queue_id];
	if (unlikely(vq->enabled == 0))
		return 0;

	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
	if (count == 0)
		return 0;

	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);

	vq->shadow_used_idx = 0;
	avail_head = *((volatile uint16_t *)&vq->avail->idx);
	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;

		if (unlikely(reserve_avail_buf_mergeable(dev, vq,
						pkt_len, buf_vec, &num_buffers,
						avail_head) < 0)) {
			LOG_DEBUG(VHOST_DATA,
				"(%d) failed to get enough desc from vring\n",
				dev->vid);
			vq->shadow_used_idx -= num_buffers;
			break;
		}

		LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
			dev->vid, vq->last_avail_idx,
			vq->last_avail_idx + num_buffers);

		if (copy_mbuf_to_desc_mergeable(dev, pkts[pkt_idx],
						buf_vec, num_buffers) < 0) {
			vq->shadow_used_idx -= num_buffers;
			break;
		}

		vq->last_avail_idx += num_buffers;
	}

	if (likely(vq->shadow_used_idx)) {
		flush_shadow_used_ring(dev, vq);

		/* flush used->idx update before we read avail->flags. */
		rte_mb();

		/* Kick the guest if necessary. */
		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
				&& (vq->callfd >= 0))
			eventfd_write(vq->callfd, (eventfd_t)1);
	}

	return pkt_idx;
}
uint16_t
rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
	struct rte_mbuf **pkts, uint16_t count)
{
	struct virtio_net *dev = get_device(vid);

	if (!dev)
		return 0;

	if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
		return virtio_dev_merge_rx(dev, queue_id, pkts, count);
	else
		return virtio_dev_rx(dev, queue_id, pkts, count);
}
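/*
 * Usage sketch (illustrative only; "port_id" and "vid" are assumed to refer
 * to an initialized ethdev port and vhost device, and freeing of
 * un-enqueued mbufs is the caller's responsibility):
 *
 *	struct rte_mbuf *burst[MAX_PKT_BURST];
 *	uint16_t nb_rx, nb_enq;
 *
 *	nb_rx  = rte_eth_rx_burst(port_id, 0, burst, MAX_PKT_BURST);
 *	nb_enq = rte_vhost_enqueue_burst(vid, VIRTIO_RXQ, burst, nb_rx);
 *	while (nb_enq < nb_rx)
 *		rte_pktmbuf_free(burst[nb_enq++]);
 */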
static inline bool __attribute__((always_inline))
virtio_net_with_host_offload(struct virtio_net *dev)
{
	if (dev->features &
		((1ULL << VIRTIO_NET_F_CSUM) |
		 (1ULL << VIRTIO_NET_F_HOST_ECN) |
		 (1ULL << VIRTIO_NET_F_HOST_TSO4) |
		 (1ULL << VIRTIO_NET_F_HOST_TSO6) |
		 (1ULL << VIRTIO_NET_F_HOST_UFO)))
		return true;

	return false;
}
static void
parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
{
	struct ipv4_hdr *ipv4_hdr;
	struct ipv6_hdr *ipv6_hdr;
	void *l3_hdr = NULL;
	struct ether_hdr *eth_hdr;
	uint16_t ethertype;

	eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	m->l2_len = sizeof(struct ether_hdr);
	ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);

	if (ethertype == ETHER_TYPE_VLAN) {
		struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);

		m->l2_len += sizeof(struct vlan_hdr);
		ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
	}

	l3_hdr = (char *)eth_hdr + m->l2_len;

	switch (ethertype) {
	case ETHER_TYPE_IPv4:
		ipv4_hdr = (struct ipv4_hdr *)l3_hdr;
		*l4_proto = ipv4_hdr->next_proto_id;
		m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
		*l4_hdr = (char *)l3_hdr + m->l3_len;
		m->ol_flags |= PKT_TX_IPV4;
		break;
	case ETHER_TYPE_IPv6:
		ipv6_hdr = (struct ipv6_hdr *)l3_hdr;
		*l4_proto = ipv6_hdr->proto;
		m->l3_len = sizeof(struct ipv6_hdr);
		*l4_hdr = (char *)l3_hdr + m->l3_len;
		m->ol_flags |= PKT_TX_IPV6;
		break;
	default:
		m->l3_len = 0;
		*l4_proto = 0;
		break;
	}
}
static inline void __attribute__((always_inline))
vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
{
	uint16_t l4_proto = 0;
	void *l4_hdr = NULL;
	struct tcp_hdr *tcp_hdr = NULL;

	if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
		return;

	parse_ethernet(m, &l4_proto, &l4_hdr);
	if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
		if (hdr->csum_start == (m->l2_len + m->l3_len)) {
			switch (hdr->csum_offset) {
			case (offsetof(struct tcp_hdr, cksum)):
				if (l4_proto == IPPROTO_TCP)
					m->ol_flags |= PKT_TX_TCP_CKSUM;
				break;
			case (offsetof(struct udp_hdr, dgram_cksum)):
				if (l4_proto == IPPROTO_UDP)
					m->ol_flags |= PKT_TX_UDP_CKSUM;
				break;
			case (offsetof(struct sctp_hdr, cksum)):
				if (l4_proto == IPPROTO_SCTP)
					m->ol_flags |= PKT_TX_SCTP_CKSUM;
				break;
			default:
				break;
			}
		}
	}

	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
		case VIRTIO_NET_HDR_GSO_TCPV4:
		case VIRTIO_NET_HDR_GSO_TCPV6:
			tcp_hdr = (struct tcp_hdr *)l4_hdr;
			m->ol_flags |= PKT_TX_TCP_SEG;
			m->tso_segsz = hdr->gso_size;
			m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
			break;
		default:
			RTE_LOG(WARNING, VHOST_DATA,
				"unsupported gso type %u.\n", hdr->gso_type);
			break;
		}
	}
}
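/*
 * Example: data_off keeps the TCP header length in its upper four bits, in
 * units of 32-bit words, so (data_off & 0xf0) >> 2 yields the length in
 * bytes; for a header without options data_off is 0x50 and l4_len becomes 20.
 */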
#define RARP_PKT_SIZE	64
static int
make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
{
	struct ether_hdr *eth_hdr;
	struct arp_hdr *rarp;

	if (rarp_mbuf->buf_len < RARP_PKT_SIZE) {
		RTE_LOG(WARNING, VHOST_DATA,
			"failed to make RARP; mbuf size too small %u (< %d)\n",
			rarp_mbuf->buf_len, RARP_PKT_SIZE);
		return -1;
	}

	/* Ethernet header. */
	eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);
	memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
	ether_addr_copy(mac, &eth_hdr->s_addr);
	eth_hdr->ether_type = htons(ETHER_TYPE_RARP);

	/* RARP header. */
	rarp = (struct arp_hdr *)(eth_hdr + 1);
	rarp->arp_hrd = htons(ARP_HRD_ETHER);
	rarp->arp_pro = htons(ETHER_TYPE_IPv4);
	rarp->arp_hln = ETHER_ADDR_LEN;
	rarp->arp_pln = 4;
	rarp->arp_op  = htons(ARP_OP_REVREQUEST);

	ether_addr_copy(mac, &rarp->arp_data.arp_sha);
	ether_addr_copy(mac, &rarp->arp_data.arp_tha);
	memset(&rarp->arp_data.arp_sip, 0x00, 4);
	memset(&rarp->arp_data.arp_tip, 0x00, 4);

	rarp_mbuf->pkt_len = rarp_mbuf->data_len = RARP_PKT_SIZE;

	return 0;
}
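/*
 * The resulting frame is a 14-byte Ethernet header followed by a 28-byte
 * RARP payload (42 bytes of real data); pkt_len is set to RARP_PKT_SIZE (64)
 * so the frame meets the minimum Ethernet length when transmitted.
 */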
static inline void __attribute__((always_inline))
put_zmbuf(struct zcopy_mbuf *zmbuf)
{
	zmbuf->in_use = 0;
}
static inline int __attribute__((always_inline))
copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
		  uint16_t max_desc, struct rte_mbuf *m, uint16_t desc_idx,
		  struct rte_mempool *mbuf_pool)
{
	struct vring_desc *desc;
	uint64_t desc_addr;
	uint32_t desc_avail, desc_offset;
	uint32_t mbuf_avail, mbuf_offset;
	uint32_t cpy_len;
	uint64_t hpa;
	struct rte_mbuf *cur = m, *prev = m;
	struct virtio_net_hdr *hdr = NULL;
	/* A counter to avoid desc dead loop chain */
	uint32_t nr_desc = 1;

	desc = &descs[desc_idx];
	if (unlikely((desc->len < dev->vhost_hlen)) ||
			(desc->flags & VRING_DESC_F_INDIRECT))
		return -1;

	desc_addr = gpa_to_vva(dev, desc->addr);
	if (unlikely(!desc_addr))
		return -1;

	if (virtio_net_with_host_offload(dev)) {
		hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
		rte_prefetch0(hdr);
	}

	/*
	 * A virtio driver normally uses at least 2 desc buffers
	 * for Tx: the first for storing the header, and others
	 * for storing the data.
	 */
	if (likely((desc->len == dev->vhost_hlen) &&
		   (desc->flags & VRING_DESC_F_NEXT) != 0)) {
		desc = &descs[desc->next];
		if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
			return -1;

		desc_addr = gpa_to_vva(dev, desc->addr);
		if (unlikely(!desc_addr))
			return -1;

		desc_offset = 0;
		desc_avail  = desc->len;
		nr_desc    += 1;
	} else {
		desc_avail  = desc->len - dev->vhost_hlen;
		desc_offset = dev->vhost_hlen;
	}

	rte_prefetch0((void *)(uintptr_t)(desc_addr + desc_offset));

	PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset), desc_avail, 0);

	mbuf_offset = 0;
	mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
	while (1) {
		cpy_len = RTE_MIN(desc_avail, mbuf_avail);

		/*
		 * A desc buf might span two host physical pages that are
		 * not contiguous. In such a case (gpa_to_hpa returns 0), the
		 * data is copied even though zero copy is enabled.
		 */
		if (unlikely(dev->dequeue_zero_copy && (hpa = gpa_to_hpa(dev,
					desc->addr + desc_offset, cpy_len)))) {
			cur->data_len = cpy_len;
			cur->buf_addr = (void *)(uintptr_t)desc_addr;
			cur->buf_physaddr = hpa;

			/*
			 * In zero copy mode, one mbuf can only reference data
			 * for all or part of one desc buf.
			 */
			mbuf_avail = cpy_len;
		} else {
			rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
							   mbuf_offset),
				(void *)((uintptr_t)(desc_addr + desc_offset)),
				cpy_len);
		}

		mbuf_avail  -= cpy_len;
		mbuf_offset += cpy_len;
		desc_avail  -= cpy_len;
		desc_offset += cpy_len;

		/* This desc reaches its end, get the next one */
		if (desc_avail == 0) {
			if ((desc->flags & VRING_DESC_F_NEXT) == 0)
				break;

			if (unlikely(desc->next >= max_desc ||
				     ++nr_desc > max_desc))
				return -1;
			desc = &descs[desc->next];
			if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
				return -1;

			desc_addr = gpa_to_vva(dev, desc->addr);
			if (unlikely(!desc_addr))
				return -1;

			rte_prefetch0((void *)(uintptr_t)desc_addr);

			desc_offset = 0;
			desc_avail  = desc->len;

			PRINT_PACKET(dev, (uintptr_t)desc_addr, desc->len, 0);
		}

		/*
		 * This mbuf reaches its end, get a new one to hold more data.
		 */
		if (mbuf_avail == 0) {
			cur = rte_pktmbuf_alloc(mbuf_pool);
			if (unlikely(cur == NULL)) {
				RTE_LOG(ERR, VHOST_DATA, "Failed to "
					"allocate memory for mbuf.\n");
				return -1;
			}

			prev->next = cur;
			prev->data_len = mbuf_offset;
			m->nb_segs += 1;
			m->pkt_len += mbuf_offset;
			prev = cur;

			mbuf_offset = 0;
			mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
		}
	}

	prev->data_len = mbuf_offset;
	m->pkt_len    += mbuf_offset;

	if (hdr)
		vhost_dequeue_offload(hdr, m);

	return 0;
}
static inline void __attribute__((always_inline))
update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
		 uint32_t used_idx, uint32_t desc_idx)
{
	vq->used->ring[used_idx].id  = desc_idx;
	vq->used->ring[used_idx].len = 0;
	vhost_log_used_vring(dev, vq,
			offsetof(struct vring_used, ring[used_idx]),
			sizeof(vq->used->ring[used_idx]));
}
static inline void __attribute__((always_inline))
update_used_idx(struct virtio_net *dev, struct vhost_virtqueue *vq,
		uint32_t count)
{
	if (unlikely(count == 0))
		return;

	rte_smp_wmb();
	rte_smp_rmb();

	vq->used->idx += count;
	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
			sizeof(vq->used->idx));

	/* Kick guest if required. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
			&& (vq->callfd >= 0))
		eventfd_write(vq->callfd, (eventfd_t)1);
}
static inline struct zcopy_mbuf *__attribute__((always_inline))
get_zmbuf(struct vhost_virtqueue *vq)
{
	uint16_t i;
	uint16_t last;

	/* search [last_zmbuf_idx, zmbuf_size) */
	i = vq->last_zmbuf_idx;
	last = vq->zmbuf_size;

	for (; i < last; i++) {
		if (vq->zmbufs[i].in_use == 0) {
			vq->last_zmbuf_idx = i + 1;
			vq->zmbufs[i].in_use = 1;
			return &vq->zmbufs[i];
		}
	}

	/* search [0, last_zmbuf_idx) */
	i = 0;
	last = vq->last_zmbuf_idx;
	for (; i < last; i++) {
		if (vq->zmbufs[i].in_use == 0) {
			vq->last_zmbuf_idx = i + 1;
			vq->zmbufs[i].in_use = 1;
			return &vq->zmbufs[i];
		}
	}

	return NULL;
}
static inline bool __attribute__((always_inline))
mbuf_is_consumed(struct rte_mbuf *m)
{
	while (m) {
		if (rte_mbuf_refcnt_read(m) > 1)
			return false;
		m = m->next;
	}

	return true;
}
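/*
 * Usage sketch for the dequeue path below (illustrative only; "vid",
 * "mbuf_pool" and "port_id" are assumed to refer to an initialized vhost
 * device, mempool and ethdev port):
 *
 *	struct rte_mbuf *burst[MAX_PKT_BURST];
 *	uint16_t n, sent;
 *
 *	n = rte_vhost_dequeue_burst(vid, VIRTIO_TXQ, mbuf_pool,
 *				    burst, MAX_PKT_BURST);
 *	sent = rte_eth_tx_burst(port_id, 0, burst, n);
 *	while (sent < n)
 *		rte_pktmbuf_free(burst[sent++]);
 */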
uint16_t
rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
{
	struct virtio_net *dev;
	struct rte_mbuf *rarp_mbuf = NULL;
	struct vhost_virtqueue *vq;
	uint32_t desc_indexes[MAX_PKT_BURST];
	uint32_t used_idx;
	uint32_t i = 0;
	uint16_t free_entries;
	uint16_t avail_idx;

	dev = get_device(vid);
	if (!dev)
		return 0;

	if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
			dev->vid, __func__, queue_id);
		return 0;
	}

	vq = dev->virtqueue[queue_id];
	if (unlikely(vq->enabled == 0))
		return 0;

	if (unlikely(dev->dequeue_zero_copy)) {
		struct zcopy_mbuf *zmbuf, *next;
		int nr_updated = 0;

		for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
		     zmbuf != NULL; zmbuf = next) {
			next = TAILQ_NEXT(zmbuf, next);

			if (mbuf_is_consumed(zmbuf->mbuf)) {
				used_idx = vq->last_used_idx++ & (vq->size - 1);
				update_used_ring(dev, vq, used_idx,
						 zmbuf->desc_idx);
				nr_updated += 1;

				TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
				rte_pktmbuf_free(zmbuf->mbuf);
				put_zmbuf(zmbuf);
			}
		}

		update_used_idx(dev, vq, nr_updated);
	}

	/*
	 * Construct a RARP broadcast packet, and inject it to the "pkts"
	 * array, to make it look like the guest actually sent such a packet.
	 *
	 * Check user_send_rarp() for more information.
	 */
	if (unlikely(rte_atomic16_cmpset((volatile uint16_t *)
					 &dev->broadcast_rarp.cnt, 1, 0))) {
		rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool);
		if (rarp_mbuf == NULL) {
			RTE_LOG(ERR, VHOST_DATA,
				"Failed to allocate memory for mbuf.\n");
			return 0;
		}

		if (make_rarp_packet(rarp_mbuf, &dev->mac)) {
			rte_pktmbuf_free(rarp_mbuf);
			rarp_mbuf = NULL;
		} else {
			count -= 1;
		}
	}

	free_entries = *((volatile uint16_t *)&vq->avail->idx) -
			vq->last_avail_idx;
	if (free_entries == 0)
		goto out;

	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);

	/* Prefetch available and used ring */
	avail_idx = vq->last_avail_idx & (vq->size - 1);
	used_idx  = vq->last_used_idx  & (vq->size - 1);
	rte_prefetch0(&vq->avail->ring[avail_idx]);
	rte_prefetch0(&vq->used->ring[used_idx]);

	count = RTE_MIN(count, MAX_PKT_BURST);
	count = RTE_MIN(count, free_entries);
	LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
			dev->vid, count);

	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (i = 0; i < count; i++) {
		avail_idx = (vq->last_avail_idx + i) & (vq->size - 1);
		used_idx  = (vq->last_used_idx  + i) & (vq->size - 1);
		desc_indexes[i] = vq->avail->ring[avail_idx];

		if (likely(dev->dequeue_zero_copy == 0))
			update_used_ring(dev, vq, used_idx, desc_indexes[i]);
	}

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[desc_indexes[0]]);
	for (i = 0; i < count; i++) {
		struct vring_desc *desc;
		uint32_t sz;
		uint16_t idx;
		int err;

		if (likely(i + 1 < count))
			rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);

		if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
			desc = (struct vring_desc *)(uintptr_t)gpa_to_vva(dev,
					vq->desc[desc_indexes[i]].addr);
			if (unlikely(!desc))
				break;

			rte_prefetch0(desc);
			sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
			idx = 0;
		} else {
			desc = vq->desc;
			sz = vq->size;
			idx = desc_indexes[i];
		}

		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
		if (unlikely(pkts[i] == NULL)) {
			RTE_LOG(ERR, VHOST_DATA,
				"Failed to allocate memory for mbuf.\n");
			break;
		}

		err = copy_desc_to_mbuf(dev, desc, sz, pkts[i], idx,
					mbuf_pool);
		if (unlikely(err)) {
			rte_pktmbuf_free(pkts[i]);
			break;
		}

		if (unlikely(dev->dequeue_zero_copy)) {
			struct zcopy_mbuf *zmbuf;

			zmbuf = get_zmbuf(vq);
			if (!zmbuf) {
				rte_pktmbuf_free(pkts[i]);
				break;
			}
			zmbuf->mbuf = pkts[i];
			zmbuf->desc_idx = desc_indexes[i];

			/*
			 * Pin the mbuf with an extra reference; we check
			 * later whether the mbuf has been freed by the
			 * application (i.e. we are the last user), in which
			 * case the used ring can be updated safely.
			 */
			rte_mbuf_refcnt_update(pkts[i], 1);

			TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
		}
	}
	vq->last_avail_idx += i;

	if (likely(dev->dequeue_zero_copy == 0)) {
		vq->last_used_idx += i;
		update_used_idx(dev, vq, i);
	}

out:
	if (unlikely(rarp_mbuf != NULL)) {
		/*
		 * Inject it to the head of the "pkts" array, so that the
		 * switch's MAC learning table gets updated first.
		 */
		memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *));
		pkts[0] = rarp_mbuf;
		i += 1;
	}

	return i;
}
;