From 1597fc8bf79775ed80f7bd7032a3dd89083efe6c Mon Sep 17 00:00:00 2001
From: Wolfgang Bumiller
Date: Thu, 13 Oct 2016 16:49:30 +0200
Subject: [PATCH] Fix #927: add IPoIB performance regression fix

Fixes kernel bug #111921
---
 ...e-IB-LL-address-into-the-hard-header.patch | 365 ++++++++++++++++++
 Makefile                                      |   2 +
 2 files changed, 367 insertions(+)
 create mode 100644 IB-ipoib-move-back-the-IB-LL-address-into-the-hard-header.patch

diff --git a/IB-ipoib-move-back-the-IB-LL-address-into-the-hard-header.patch b/IB-ipoib-move-back-the-IB-LL-address-into-the-hard-header.patch
new file mode 100644
index 0000000..5b58edd
--- /dev/null
+++ b/IB-ipoib-move-back-the-IB-LL-address-into-the-hard-header.patch
@@ -0,0 +1,365 @@
+From patchwork Wed Oct 12 14:30:30 2016
+Content-Type: text/plain; charset="utf-8"
+MIME-Version: 1.0
+Content-Transfer-Encoding: 7bit
+Subject: [v2] IB/ipoib: move back IB LL address into the hard header
+From: Paolo Abeni
+X-Patchwork-Id: 681344
+X-Patchwork-Delegate: davem@davemloft.net
+Message-Id: <60efcf739ce3d45a01a7127dbaf7fe366e5ddce4.1476264804.git.pabeni@redhat.com>
+To: linux-rdma@vger.kernel.org
+Cc: Doug Ledford , Sean Hefty ,
+ Hal Rosenstock ,
+ Jason Gunthorpe , netdev@vger.kernel.org
+Date: Wed, 12 Oct 2016 16:30:30 +0200
+
+After the commit 9207f9d45b0a ("net: preserve IP control block
+during GSO segmentation"), the GSO CB and the IPoIB CB conflict.
+That destroys the IPoIB address information cached there,
+causing a severe performance regression, as described in more detail here:
+
+http://marc.info/?l=linux-kernel&m=146787279825501&w=2
+
+This change moves the data cached by the IPoIB driver from the
+skb control block into the IPoIB hard header, as was done before
+the commit 936d7de3d736 ("IPoIB: Stop lying about hard_header_len
+and use skb->cb to stash LL addresses").
+In order to avoid GRO issues, on packet reception the IPoIB driver
+stashes a dummy pseudo header into the skb, so that the received
+packets actually have a hard header matching the declared length.
+To avoid changing the connected mode maximum MTU, the allocated
+head buffer size is increased by the pseudo header length.
+
+After this commit, IPoIB performance is back to its pre-regression
+value.
+
+v1 -> v2: avoid changing the max mtu, increasing the head buf size
+
+Fixes: 9207f9d45b0a ("net: preserve IP control block during GSO segmentation")
+Signed-off-by: Paolo Abeni
+---
+ drivers/infiniband/ulp/ipoib/ipoib.h | 20 +++++++---
+ drivers/infiniband/ulp/ipoib/ipoib_cm.c | 15 +++----
+ drivers/infiniband/ulp/ipoib/ipoib_ib.c | 12 +++---
+ drivers/infiniband/ulp/ipoib/ipoib_main.c | 54 ++++++++++++++++----------
+ drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 6 ++-
+ 5 files changed, 64 insertions(+), 43 deletions(-)
+
+diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
+index 9dbfcc0..5ff64af 100644
+--- a/drivers/infiniband/ulp/ipoib/ipoib.h
++++ b/drivers/infiniband/ulp/ipoib/ipoib.h
+@@ -63,6 +63,8 @@ enum ipoib_flush_level {
+
+ enum {
+ IPOIB_ENCAP_LEN = 4,
++ IPOIB_PSEUDO_LEN = 20,
++ IPOIB_HARD_LEN = IPOIB_ENCAP_LEN + IPOIB_PSEUDO_LEN,
+
+ IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + IPOIB_ENCAP_LEN,
+ IPOIB_UD_RX_SG = 2, /* max buffer needed for 4K mtu */
+@@ -134,15 +136,21 @@ struct ipoib_header {
+ u16 reserved;
+ };
+
+-struct ipoib_cb {
+- struct qdisc_skb_cb qdisc_cb;
+- u8 hwaddr[INFINIBAND_ALEN];
++struct ipoib_pseudo_header {
++ u8 hwaddr[INFINIBAND_ALEN];
+ };
+
+-static inline struct ipoib_cb *ipoib_skb_cb(const struct sk_buff *skb)
++static inline void skb_add_pseudo_hdr(struct sk_buff *skb)
+ {
+- BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct ipoib_cb));
+- return (struct ipoib_cb *)skb->cb;
++ char *data = skb_push(skb, IPOIB_PSEUDO_LEN);
++
++ /*
++ * only the ipoib header is present now, make room for a dummy
++ * pseudo header and set skb field accordingly
++ */
++ memset(data, 0, IPOIB_PSEUDO_LEN);
++ skb_reset_mac_header(skb);
++ skb_pull(skb, IPOIB_HARD_LEN);
+ }
+
+ /* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */
+diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+index 4ad297d..339a1ee 100644
+--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
++++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+@@ -63,6 +63,8 @@ MODULE_PARM_DESC(cm_data_debug_level,
+ #define IPOIB_CM_RX_DELAY (3 * 256 * HZ)
+ #define IPOIB_CM_RX_UPDATE_MASK (0x3)
+
++#define IPOIB_CM_RX_RESERVE (ALIGN(IPOIB_HARD_LEN, 16) - IPOIB_ENCAP_LEN)
++
+ static struct ib_qp_attr ipoib_cm_err_attr = {
+ .qp_state = IB_QPS_ERR
+ };
+@@ -146,15 +148,15 @@ static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev,
+ struct sk_buff *skb;
+ int i;
+
+- skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12);
++ skb = dev_alloc_skb(ALIGN(IPOIB_CM_HEAD_SIZE + IPOIB_PSEUDO_LEN, 16));
+ if (unlikely(!skb))
+ return NULL;
+
+ /*
+- * IPoIB adds a 4 byte header. So we need 12 more bytes to align the
++ * IPoIB adds a IPOIB_ENCAP_LEN byte header, this will align the
+ * IP header to a multiple of 16.
+ */
+- skb_reserve(skb, 12);
++ skb_reserve(skb, IPOIB_CM_RX_RESERVE);
+
+ mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE,
+ DMA_FROM_DEVICE);
+@@ -624,9 +626,9 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
+ if (wc->byte_len < IPOIB_CM_COPYBREAK) {
+ int dlen = wc->byte_len;
+
+- small_skb = dev_alloc_skb(dlen + 12);
++ small_skb = dev_alloc_skb(dlen + IPOIB_CM_RX_RESERVE);
+ if (small_skb) {
+- skb_reserve(small_skb, 12);
++ skb_reserve(small_skb, IPOIB_CM_RX_RESERVE);
+ ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0],
+ dlen, DMA_FROM_DEVICE);
+ skb_copy_from_linear_data(skb, small_skb->data, dlen);
+@@ -663,8 +665,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
+
+ copied:
+ skb->protocol = ((struct ipoib_header *) skb->data)->proto;
+- skb_reset_mac_header(skb);
+- skb_pull(skb, IPOIB_ENCAP_LEN);
++ skb_add_pseudo_hdr(skb);
+
+ ++dev->stats.rx_packets;
+ dev->stats.rx_bytes += skb->len;
+diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+index be11d5d..830fecb 100644
+--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
++++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+@@ -128,16 +128,15 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
+
+ buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
+
+- skb = dev_alloc_skb(buf_size + IPOIB_ENCAP_LEN);
++ skb = dev_alloc_skb(buf_size + IPOIB_HARD_LEN);
+ if (unlikely(!skb))
+ return NULL;
+
+ /*
+- * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte
+- * header. So we need 4 more bytes to get to 48 and align the
+- * IP header to a multiple of 16.
++ * the IP header will be at IPOIP_HARD_LEN + IB_GRH_BYTES, that is
++ * 64 bytes aligned
+ */
+- skb_reserve(skb, 4);
++ skb_reserve(skb, sizeof(struct ipoib_pseudo_header));
+
+ mapping = priv->rx_ring[id].mapping;
+ mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size,
+@@ -253,8 +252,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
+ skb_pull(skb, IB_GRH_BYTES);
+
+ skb->protocol = ((struct ipoib_header *) skb->data)->proto;
+- skb_reset_mac_header(skb);
+- skb_pull(skb, IPOIB_ENCAP_LEN);
++ skb_add_pseudo_hdr(skb);
+
+ ++dev->stats.rx_packets;
+ dev->stats.rx_bytes += skb->len;
+diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
+index cc1c1b0..823a528 100644
+--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
++++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
+@@ -925,9 +925,12 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
+ ipoib_neigh_free(neigh);
+ goto err_drop;
+ }
+- if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
++ if (skb_queue_len(&neigh->queue) <
++ IPOIB_MAX_PATH_REC_QUEUE) {
++ /* put pseudoheader back on for next time */
++ skb_push(skb, IPOIB_PSEUDO_LEN);
+ __skb_queue_tail(&neigh->queue, skb);
+- else {
++ } else {
+ ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
+ skb_queue_len(&neigh->queue));
+ goto err_drop;
+@@ -964,7 +967,7 @@ err_drop:
+ }
+
+ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
+- struct ipoib_cb *cb)
++ struct ipoib_pseudo_header *phdr)
+ {
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_path *path;
+@@ -972,16 +975,18 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+- path = __path_find(dev, cb->hwaddr + 4);
++ path = __path_find(dev, phdr->hwaddr + 4);
+ if (!path || !path->valid) {
+ int new_path = 0;
+
+ if (!path) {
+- path = path_rec_create(dev, cb->hwaddr + 4);
++ path = path_rec_create(dev, phdr->hwaddr + 4);
+ new_path = 1;
+ }
+ if (path) {
+ if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
++ /* put pseudoheader back on for next time */
++ skb_push(skb, IPOIB_PSEUDO_LEN);
+ __skb_queue_tail(&path->queue, skb);
+ } else {
+ ++dev->stats.tx_dropped;
+@@ -1009,10 +1014,12 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
+ be16_to_cpu(path->pathrec.dlid));
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+- ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr));
++ ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr));
+ return;
+ } else if ((path->query || !path_rec_start(dev, path)) &&
+ skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
++ /* put pseudoheader back on for next time */
++ skb_push(skb, IPOIB_PSEUDO_LEN);
+ __skb_queue_tail(&path->queue, skb);
+ } else {
+ ++dev->stats.tx_dropped;
+@@ -1026,13 +1033,15 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
+ {
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_neigh *neigh;
+- struct ipoib_cb *cb = ipoib_skb_cb(skb);
++ struct ipoib_pseudo_header *phdr;
+ struct ipoib_header *header;
+ unsigned long flags;
+
++ phdr = (struct ipoib_pseudo_header *) skb->data;
++ skb_pull(skb, sizeof(*phdr));
+ header = (struct ipoib_header *) skb->data;
+
+- if (unlikely(cb->hwaddr[4] == 0xff)) {
++ if (unlikely(phdr->hwaddr[4] == 0xff)) {
+ /* multicast, arrange "if" according to probability */
+ if ((header->proto != htons(ETH_P_IP)) &&
+ (header->proto != htons(ETH_P_IPV6)) &&
+@@ -1045,13 +1054,13 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
+ return NETDEV_TX_OK;
+ }
+ /* Add in the P_Key for multicast*/
+- cb->hwaddr[8] = (priv->pkey >> 8) & 0xff;
+- cb->hwaddr[9] = priv->pkey & 0xff;
++ phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
++ phdr->hwaddr[9] = priv->pkey & 0xff;
+
+- neigh = ipoib_neigh_get(dev, cb->hwaddr);
++ neigh = ipoib_neigh_get(dev, phdr->hwaddr);
+ if (likely(neigh))
+ goto send_using_neigh;
+- ipoib_mcast_send(dev, cb->hwaddr, skb);
++ ipoib_mcast_send(dev, phdr->hwaddr, skb);
+ return NETDEV_TX_OK;
+ }
+
+@@ -1060,16 +1069,16 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
+ case htons(ETH_P_IP):
+ case htons(ETH_P_IPV6):
+ case htons(ETH_P_TIPC):
+- neigh = ipoib_neigh_get(dev, cb->hwaddr);
++ neigh = ipoib_neigh_get(dev, phdr->hwaddr);
+ if (unlikely(!neigh)) {
+- neigh_add_path(skb, cb->hwaddr, dev);
++ neigh_add_path(skb, phdr->hwaddr, dev);
+ return NETDEV_TX_OK;
+ }
+ break;
+ case htons(ETH_P_ARP):
+ case htons(ETH_P_RARP):
+ /* for unicast ARP and RARP should always perform path find */
+- unicast_arp_send(skb, dev, cb);
++ unicast_arp_send(skb, dev, phdr);
+ return NETDEV_TX_OK;
+ default:
+ /* ethertype not supported by IPoIB */
+@@ -1086,11 +1095,13 @@ send_using_neigh:
+ goto unref;
+ }
+ } else if (neigh->ah) {
+- ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr));
++ ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(phdr->hwaddr));
+ goto unref;
+ }
+
+ if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
++ /* put pseudoheader back on for next time */
++ skb_push(skb, sizeof(*phdr));
+ spin_lock_irqsave(&priv->lock, flags);
+ __skb_queue_tail(&neigh->queue, skb);
+ spin_unlock_irqrestore(&priv->lock, flags);
+@@ -1122,8 +1133,8 @@ static int ipoib_hard_header(struct sk_buff *skb,
+ unsigned short type,
+ const void *daddr, const void *saddr, unsigned len)
+ {
++ struct ipoib_pseudo_header *phdr;
+ struct ipoib_header *header;
+- struct ipoib_cb *cb = ipoib_skb_cb(skb);
+
+ header = (struct ipoib_header *) skb_push(skb, sizeof *header);
+
+@@ -1132,12 +1143,13 @@ static int ipoib_hard_header(struct sk_buff *skb,
+
+ /*
+ * we don't rely on dst_entry structure, always stuff the
+- * destination address into skb->cb so we can figure out where
++ * destination address into skb hard header so we can figure out where
+ * to send the packet later.
+ */
+- memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN);
++ phdr = (struct ipoib_pseudo_header *) skb_push(skb, sizeof(*phdr));
++ memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);
+
+- return sizeof *header;
++ return IPOIB_HARD_LEN;
+ }
+
+ static void ipoib_set_mcast_list(struct net_device *dev)
+@@ -1759,7 +1771,7 @@ void ipoib_setup(struct net_device *dev)
+
+ dev->flags |= IFF_BROADCAST | IFF_MULTICAST;
+
+- dev->hard_header_len = IPOIB_ENCAP_LEN;
++ dev->hard_header_len = IPOIB_HARD_LEN;
+ dev->addr_len = INFINIBAND_ALEN;
+ dev->type = ARPHRD_INFINIBAND;
+ dev->tx_queue_len = ipoib_sendq_size * 2;
+diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+index d3394b6..1909dd2 100644
+--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
++++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+@@ -796,9 +796,11 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
+ __ipoib_mcast_add(dev, mcast);
+ list_add_tail(&mcast->list, &priv->multicast_list);
+ }
+- if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE)
++ if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) {
++ /* put pseudoheader back on for next time */
++ skb_push(skb, sizeof(struct ipoib_pseudo_header));
+ skb_queue_tail(&mcast->pkt_queue, skb);
+- else {
++ } else {
+ ++dev->stats.tx_dropped;
+ dev_kfree_skb_any(skb);
+ }
diff --git a/Makefile b/Makefile
index 4055d41..6a608c5 100644
--- a/Makefile
+++ b/Makefile
@@ -267,6 +267,8 @@ ${KERNEL_SRC}/README ${KERNEL_CFG_ORG}: ${KERNELSRCTAR}
 	cd ${KERNEL_SRC}; patch -p1 < ../mei_bus-whitelist-watchdog-client.patch
 	#sd: Fix rw_max for devices that report an optimal xfer size
 	cd ${KERNEL_SRC}; patch -p1 < ../sd-fix-rw_max.patch
+	# IPoIB performance regression fix
+	cd ${KERNEL_SRC}; patch -p1 < ../IB-ipoib-move-back-the-IB-LL-address-into-the-hard-header.patch
 	sed -i ${KERNEL_SRC}/Makefile -e 's/^EXTRAVERSION.*$$/EXTRAVERSION=${EXTRAVERSION}/'
 	touch $@
-- 
2.39.2