From dfaf00e8c3ea576e103bbb9e7f0736630df0e330 Mon Sep 17 00:00:00 2001 From: Mark Kavanagh Date: Fri, 2 Nov 2018 09:06:32 +0000 Subject: [PATCH] netdev-dpdk: fix mbuf sizing There are numerous factors that must be considered when calculating the size of an mbuf: - the data portion of the mbuf must be sized in accordance with Rx buffer alignment (typically 1024B). So, for example, in order to successfully receive and capture a 1500B packet, mbufs with a data portion of size 2048B must be used. - in OvS, the elements that comprise an mbuf are: * the dp_packet, which includes a struct rte_mbuf (704B) * RTE_PKTMBUF_HEADROOM (128B) * packet data (aligned to 1k, as previously described) * RTE_PKTMBUF_TAILROOM (typically 0) Some PMDs require that the total mbuf size (i.e. the total sum of all of the above-listed components' lengths) is cache-aligned. To satisfy this requirement, it may be necessary to round up the total mbuf size with respect to cacheline size. In doing so, it's possible that the dp_packet's data portion is inadvertently increased in size, such that it no longer adheres to Rx buffer alignment. Consequently, the following property of the mbuf no longer holds true: mbuf.data_len == mbuf.buf_len - mbuf.data_off This creates a problem in the case of multi-segment mbufs, where that property is assumed to hold for all but the final segment in an mbuf chain. Resolve this issue by adjusting the size of the mbuf's private data portion, as opposed to the packet data portion, when aligning mbuf size to cachelines. 
Co-authored-by: Tiago Lam Fixes: 4be4d22 ("netdev-dpdk: clean up mbuf initialization") Fixes: 31b88c9 ("netdev-dpdk: round up mbuf_size to cache_line_size") CC: Santosh Shukla Signed-off-by: Mark Kavanagh Signed-off-by: Tiago Lam Acked-by: Santosh Shukla Acked-by: Eelco Chaudron Signed-off-by: Ian Stokes --- Documentation/topics/dpdk/memory.rst | 28 +++++++------- lib/netdev-dpdk.c | 55 +++++++++++++++++++--------- 2 files changed, 51 insertions(+), 32 deletions(-) diff --git a/Documentation/topics/dpdk/memory.rst b/Documentation/topics/dpdk/memory.rst index e5fb166d5..c9b739fb7 100644 --- a/Documentation/topics/dpdk/memory.rst +++ b/Documentation/topics/dpdk/memory.rst @@ -107,8 +107,8 @@ Example 1 MTU = 1500 Bytes Number of mbufs = 262144 - Mbuf size = 3008 Bytes - Memory required = 262144 * 3008 = 788 MB + Mbuf size = 2752 Bytes + Memory required = 262144 * 2752 = 721 MB Example 2 +++++++++ @@ -116,8 +116,8 @@ Example 2 MTU = 1800 Bytes Number of mbufs = 262144 - Mbuf size = 3008 Bytes - Memory required = 262144 * 3008 = 788 MB + Mbuf size = 2752 Bytes + Memory required = 262144 * 2752 = 721 MB .. 
note:: @@ -130,8 +130,8 @@ Example 3 MTU = 6000 Bytes Number of mbufs = 262144 - Mbuf size = 8128 Bytes - Memory required = 262144 * 8128 = 2130 MB + Mbuf size = 8000 Bytes + Memory required = 262144 * 8000 = 2097 MB Example 4 +++++++++ @@ -139,8 +139,8 @@ Example 4 MTU = 9000 Bytes Number of mbufs = 262144 - Mbuf size = 10176 Bytes - Memory required = 262144 * 10176 = 2667 MB + Mbuf size = 10048 Bytes + Memory required = 262144 * 10048 = 2634 MB Per Port Memory Calculations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -194,8 +194,8 @@ Example 1: (1 rxq, 1 PMD, 1500 MTU) MTU = 1500 Number of mbufs = (1 * 2048) + (2 * 2048) + (1 * 32) + (16384) = 22560 - Mbuf size = 3008 Bytes - Memory required = 22560 * 3008 = 67 MB + Mbuf size = 2752 Bytes + Memory required = 22560 * 2752 = 62 MB Example 2: (1 rxq, 2 PMD, 6000 MTU) +++++++++++++++++++++++++++++++++++ @@ -203,8 +203,8 @@ Example 2: (1 rxq, 2 PMD, 6000 MTU) MTU = 6000 Number of mbufs = (1 * 2048) + (3 * 2048) + (1 * 32) + (16384) = 24608 - Mbuf size = 8128 Bytes - Memory required = 24608 * 8128 = 200 MB + Mbuf size = 8000 Bytes + Memory required = 24608 * 8000 = 196 MB Example 3: (2 rxq, 2 PMD, 9000 MTU) +++++++++++++++++++++++++++++++++++ @@ -212,5 +212,5 @@ Example 3: (2 rxq, 2 PMD, 9000 MTU) MTU = 9000 Number of mbufs = (2 * 2048) + (3 * 2048) + (1 * 32) + (16384) = 26656 - Mbuf size = 10176 Bytes - Memory required = 26656 * 10176 = 271 MB + Mbuf size = 10048 Bytes + Memory required = 26656 * 10048 = 267 MB diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 6bc11b318..3f28bce4e 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -88,10 +88,6 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); #define MTU_TO_MAX_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_MAX_LEN) #define FRAME_LEN_TO_MTU(frame_len) ((frame_len) \ - ETHER_HDR_LEN - ETHER_CRC_LEN) -#define MBUF_SIZE(mtu) ROUND_UP((MTU_TO_MAX_FRAME_LEN(mtu) \ - + sizeof(struct dp_packet) \ - + RTE_PKTMBUF_HEADROOM), \ - RTE_CACHE_LINE_SIZE) #define 
NETDEV_DPDK_MBUF_ALIGN 1024 #define NETDEV_DPDK_MAX_PKT_LEN 9728 @@ -637,7 +633,11 @@ dpdk_mp_create(struct netdev_dpdk *dev, int mtu, bool per_port_mp) char mp_name[RTE_MEMPOOL_NAMESIZE]; const char *netdev_name = netdev_get_name(&dev->up); int socket_id = dev->requested_socket_id; - uint32_t n_mbufs; + uint32_t n_mbufs = 0; + uint32_t mbuf_size = 0; + uint32_t aligned_mbuf_size = 0; + uint32_t mbuf_priv_data_len = 0; + uint32_t pkt_size = 0; uint32_t hash = hash_string(netdev_name, 0); struct dpdk_mp *dmp = NULL; int ret; @@ -650,6 +650,9 @@ dpdk_mp_create(struct netdev_dpdk *dev, int mtu, bool per_port_mp) dmp->mtu = mtu; dmp->refcount = 1; + /* Get the size of each mbuf, based on the MTU */ + mbuf_size = MTU_TO_FRAME_LEN(mtu); + n_mbufs = dpdk_calculate_mbufs(dev, mtu, per_port_mp); do { @@ -661,8 +664,8 @@ dpdk_mp_create(struct netdev_dpdk *dev, int mtu, bool per_port_mp) * so this is not an issue for tasks such as debugging. */ ret = snprintf(mp_name, RTE_MEMPOOL_NAMESIZE, - "ovs%08x%02d%05d%07u", - hash, socket_id, mtu, n_mbufs); + "ovs%08x%02d%05d%07u", + hash, socket_id, mtu, n_mbufs); if (ret < 0 || ret >= RTE_MEMPOOL_NAMESIZE) { VLOG_DBG("snprintf returned %d. " "Failed to generate a mempool name for \"%s\". 
" @@ -671,17 +674,33 @@ dpdk_mp_create(struct netdev_dpdk *dev, int mtu, bool per_port_mp) break; } - VLOG_DBG("Port %s: Requesting a mempool of %u mbufs " - "on socket %d for %d Rx and %d Tx queues.", - netdev_name, n_mbufs, socket_id, - dev->requested_n_rxq, dev->requested_n_txq); - - dmp->mp = rte_pktmbuf_pool_create(mp_name, n_mbufs, - MP_CACHE_SZ, - sizeof (struct dp_packet) - - sizeof (struct rte_mbuf), - MBUF_SIZE(mtu) - - sizeof(struct dp_packet), + VLOG_DBG("Port %s: Requesting a mempool of %u mbufs of size %u " + "on socket %d for %d Rx and %d Tx queues, " + "cache line size of %u", + netdev_name, n_mbufs, mbuf_size, socket_id, + dev->requested_n_rxq, dev->requested_n_txq, + RTE_CACHE_LINE_SIZE); + + mbuf_priv_data_len = sizeof(struct dp_packet) - + sizeof(struct rte_mbuf); + /* The size of the entire dp_packet. */ + pkt_size = sizeof(struct dp_packet) + mbuf_size; + /* mbuf size, rounded up to cacheline size. */ + aligned_mbuf_size = ROUND_UP(pkt_size, RTE_CACHE_LINE_SIZE); + /* If there is a size discrepancy, add padding to mbuf_priv_data_len. + * This maintains mbuf size cache alignment, while also honoring RX + * buffer alignment in the data portion of the mbuf. If this adjustment + * is not made, there is a possibility later on that for an element of + * the mempool, buf, buf->data_len < (buf->buf_len - buf->data_off). + * This is problematic in the case of multi-segment mbufs, particularly + * when an mbuf segment needs to be resized (when [push|popp]ing a VLAN + * header, for example). + */ + mbuf_priv_data_len += (aligned_mbuf_size - pkt_size); + + dmp->mp = rte_pktmbuf_pool_create(mp_name, n_mbufs, MP_CACHE_SZ, + mbuf_priv_data_len, + mbuf_size, socket_id); if (dmp->mp) { -- 2.39.2