1 From patchwork Wed Oct 12 14:30:30 2016
2 Content-Type: text/plain; charset="utf-8"
4 Content-Transfer-Encoding: 7bit
5 Subject: [v2] IB/ipoib: move back IB LL address into the hard header
6 From: Paolo Abeni <pabeni@redhat.com>
8 X-Patchwork-Delegate: davem@davemloft.net
9 Message-Id: <60efcf739ce3d45a01a7127dbaf7fe366e5ddce4.1476264804.git.pabeni@redhat.com>
10 To: linux-rdma@vger.kernel.org
11 Cc: Doug Ledford <dledford@redhat.com>, Sean Hefty <sean.hefty@intel.com>,
12 Hal Rosenstock <hal.rosenstock@gmail.com>,
13 Jason Gunthorpe <jgunthorpe@obsidianresearch.com>, netdev@vger.kernel.org
14 Date: Wed, 12 Oct 2016 16:30:30 +0200
16 After the commit 9207f9d45b0a ("net: preserve IP control block
17 during GSO segmentation"), the GSO CB and the IPoIB CB conflict.
18 That destroys the IPoIB address information cached there,
19 causing a severe performance regression, as better described here:
21 http://marc.info/?l=linux-kernel&m=146787279825501&w=2
23 This change moves the data cached by the IPoIB driver from the
24 skb control block into the IPoIB hard header, as done before
25 the commit 936d7de3d736 ("IPoIB: Stop lying about hard_header_len
26 and use skb->cb to stash LL addresses").
27 In order to avoid GRO issues, on packet reception, the IPoIB driver
28 stashes a dummy pseudo header into the skb, so that the received
29 packets actually have a hard header matching the declared length.
30 To avoid changing the connected mode maximum mtu, the allocated
31 head buffer size is increased by the pseudo header length.
33 After this commit, IPoIB performance is back to the pre-regression level.
36 v1 -> v2: avoid changing the max mtu, increasing the head buf size
38 Fixes: 9207f9d45b0a ("net: preserve IP control block during GSO segmentation")
39 Signed-off-by: Paolo Abeni <pabeni@redhat.com>
41 drivers/infiniband/ulp/ipoib/ipoib.h | 20 +++++++---
42 drivers/infiniband/ulp/ipoib/ipoib_cm.c | 15 +++----
43 drivers/infiniband/ulp/ipoib/ipoib_ib.c | 12 +++---
44 drivers/infiniband/ulp/ipoib/ipoib_main.c | 54 ++++++++++++++++----------
45 drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 6 ++-
46 5 files changed, 64 insertions(+), 43 deletions(-)
48 diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
49 index 9dbfcc0..5ff64af 100644
50 --- a/drivers/infiniband/ulp/ipoib/ipoib.h
51 +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
52 @@ -63,6 +63,8 @@ enum ipoib_flush_level {
56 + IPOIB_PSEUDO_LEN = 20,
57 + IPOIB_HARD_LEN = IPOIB_ENCAP_LEN + IPOIB_PSEUDO_LEN,
59 IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + IPOIB_ENCAP_LEN,
60 IPOIB_UD_RX_SG = 2, /* max buffer needed for 4K mtu */
61 @@ -134,15 +136,21 @@ struct ipoib_header {
66 - struct qdisc_skb_cb qdisc_cb;
67 - u8 hwaddr[INFINIBAND_ALEN];
68 +struct ipoib_pseudo_header {
69 + u8 hwaddr[INFINIBAND_ALEN];
72 -static inline struct ipoib_cb *ipoib_skb_cb(const struct sk_buff *skb)
73 +static inline void skb_add_pseudo_hdr(struct sk_buff *skb)
75 - BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct ipoib_cb));
76 - return (struct ipoib_cb *)skb->cb;
77 + char *data = skb_push(skb, IPOIB_PSEUDO_LEN);
80 + * only the ipoib header is present now, make room for a dummy
81 + * pseudo header and set skb field accordingly
83 + memset(data, 0, IPOIB_PSEUDO_LEN);
84 + skb_reset_mac_header(skb);
85 + skb_pull(skb, IPOIB_HARD_LEN);
88 /* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */
89 diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
90 index 4ad297d..339a1ee 100644
91 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
92 +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
93 @@ -63,6 +63,8 @@ MODULE_PARM_DESC(cm_data_debug_level,
94 #define IPOIB_CM_RX_DELAY (3 * 256 * HZ)
95 #define IPOIB_CM_RX_UPDATE_MASK (0x3)
97 +#define IPOIB_CM_RX_RESERVE (ALIGN(IPOIB_HARD_LEN, 16) - IPOIB_ENCAP_LEN)
99 static struct ib_qp_attr ipoib_cm_err_attr = {
100 .qp_state = IB_QPS_ERR
102 @@ -146,15 +148,15 @@ static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev,
106 - skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12);
107 + skb = dev_alloc_skb(ALIGN(IPOIB_CM_HEAD_SIZE + IPOIB_PSEUDO_LEN, 16));
112 - * IPoIB adds a 4 byte header. So we need 12 more bytes to align the
113 + * IPoIB adds a IPOIB_ENCAP_LEN byte header, this will align the
114 * IP header to a multiple of 16.
116 - skb_reserve(skb, 12);
117 + skb_reserve(skb, IPOIB_CM_RX_RESERVE);
119 mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE,
121 @@ -624,9 +626,9 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
122 if (wc->byte_len < IPOIB_CM_COPYBREAK) {
123 int dlen = wc->byte_len;
125 - small_skb = dev_alloc_skb(dlen + 12);
126 + small_skb = dev_alloc_skb(dlen + IPOIB_CM_RX_RESERVE);
128 - skb_reserve(small_skb, 12);
129 + skb_reserve(small_skb, IPOIB_CM_RX_RESERVE);
130 ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0],
131 dlen, DMA_FROM_DEVICE);
132 skb_copy_from_linear_data(skb, small_skb->data, dlen);
133 @@ -663,8 +665,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
136 skb->protocol = ((struct ipoib_header *) skb->data)->proto;
137 - skb_reset_mac_header(skb);
138 - skb_pull(skb, IPOIB_ENCAP_LEN);
139 + skb_add_pseudo_hdr(skb);
141 ++dev->stats.rx_packets;
142 dev->stats.rx_bytes += skb->len;
143 diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
144 index be11d5d..830fecb 100644
145 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
146 +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
147 @@ -128,16 +128,15 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
149 buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
151 - skb = dev_alloc_skb(buf_size + IPOIB_ENCAP_LEN);
152 + skb = dev_alloc_skb(buf_size + IPOIB_HARD_LEN);
157 - * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte
158 - * header. So we need 4 more bytes to get to 48 and align the
159 - * IP header to a multiple of 16.
160 + * the IP header will be at IPOIB_HARD_LEN + IB_GRH_BYTES, that is
163 - skb_reserve(skb, 4);
164 + skb_reserve(skb, sizeof(struct ipoib_pseudo_header));
166 mapping = priv->rx_ring[id].mapping;
167 mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size,
168 @@ -253,8 +252,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
169 skb_pull(skb, IB_GRH_BYTES);
171 skb->protocol = ((struct ipoib_header *) skb->data)->proto;
172 - skb_reset_mac_header(skb);
173 - skb_pull(skb, IPOIB_ENCAP_LEN);
174 + skb_add_pseudo_hdr(skb);
176 ++dev->stats.rx_packets;
177 dev->stats.rx_bytes += skb->len;
178 diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
179 index cc1c1b0..823a528 100644
180 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
181 +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
182 @@ -925,9 +925,12 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
183 ipoib_neigh_free(neigh);
186 - if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
187 + if (skb_queue_len(&neigh->queue) <
188 + IPOIB_MAX_PATH_REC_QUEUE) {
189 + /* put pseudoheader back on for next time */
190 + skb_push(skb, IPOIB_PSEUDO_LEN);
191 __skb_queue_tail(&neigh->queue, skb);
194 ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
195 skb_queue_len(&neigh->queue));
197 @@ -964,7 +967,7 @@ err_drop:
200 static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
201 - struct ipoib_cb *cb)
202 + struct ipoib_pseudo_header *phdr)
204 struct ipoib_dev_priv *priv = netdev_priv(dev);
205 struct ipoib_path *path;
206 @@ -972,16 +975,18 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
208 spin_lock_irqsave(&priv->lock, flags);
210 - path = __path_find(dev, cb->hwaddr + 4);
211 + path = __path_find(dev, phdr->hwaddr + 4);
212 if (!path || !path->valid) {
216 - path = path_rec_create(dev, cb->hwaddr + 4);
217 + path = path_rec_create(dev, phdr->hwaddr + 4);
221 if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
222 + /* put pseudoheader back on for next time */
223 + skb_push(skb, IPOIB_PSEUDO_LEN);
224 __skb_queue_tail(&path->queue, skb);
226 ++dev->stats.tx_dropped;
227 @@ -1009,10 +1014,12 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
228 be16_to_cpu(path->pathrec.dlid));
230 spin_unlock_irqrestore(&priv->lock, flags);
231 - ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr));
232 + ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr));
234 } else if ((path->query || !path_rec_start(dev, path)) &&
235 skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
236 + /* put pseudoheader back on for next time */
237 + skb_push(skb, IPOIB_PSEUDO_LEN);
238 __skb_queue_tail(&path->queue, skb);
240 ++dev->stats.tx_dropped;
241 @@ -1026,13 +1033,15 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
243 struct ipoib_dev_priv *priv = netdev_priv(dev);
244 struct ipoib_neigh *neigh;
245 - struct ipoib_cb *cb = ipoib_skb_cb(skb);
246 + struct ipoib_pseudo_header *phdr;
247 struct ipoib_header *header;
250 + phdr = (struct ipoib_pseudo_header *) skb->data;
251 + skb_pull(skb, sizeof(*phdr));
252 header = (struct ipoib_header *) skb->data;
254 - if (unlikely(cb->hwaddr[4] == 0xff)) {
255 + if (unlikely(phdr->hwaddr[4] == 0xff)) {
256 /* multicast, arrange "if" according to probability */
257 if ((header->proto != htons(ETH_P_IP)) &&
258 (header->proto != htons(ETH_P_IPV6)) &&
259 @@ -1045,13 +1054,13 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
262 /* Add in the P_Key for multicast*/
263 - cb->hwaddr[8] = (priv->pkey >> 8) & 0xff;
264 - cb->hwaddr[9] = priv->pkey & 0xff;
265 + phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
266 + phdr->hwaddr[9] = priv->pkey & 0xff;
268 - neigh = ipoib_neigh_get(dev, cb->hwaddr);
269 + neigh = ipoib_neigh_get(dev, phdr->hwaddr);
271 goto send_using_neigh;
272 - ipoib_mcast_send(dev, cb->hwaddr, skb);
273 + ipoib_mcast_send(dev, phdr->hwaddr, skb);
277 @@ -1060,16 +1069,16 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
278 case htons(ETH_P_IP):
279 case htons(ETH_P_IPV6):
280 case htons(ETH_P_TIPC):
281 - neigh = ipoib_neigh_get(dev, cb->hwaddr);
282 + neigh = ipoib_neigh_get(dev, phdr->hwaddr);
283 if (unlikely(!neigh)) {
284 - neigh_add_path(skb, cb->hwaddr, dev);
285 + neigh_add_path(skb, phdr->hwaddr, dev);
289 case htons(ETH_P_ARP):
290 case htons(ETH_P_RARP):
291 /* for unicast ARP and RARP should always perform path find */
292 - unicast_arp_send(skb, dev, cb);
293 + unicast_arp_send(skb, dev, phdr);
296 /* ethertype not supported by IPoIB */
297 @@ -1086,11 +1095,13 @@ send_using_neigh:
300 } else if (neigh->ah) {
301 - ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr));
302 + ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(phdr->hwaddr));
306 if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
307 + /* put pseudoheader back on for next time */
308 + skb_push(skb, sizeof(*phdr));
309 spin_lock_irqsave(&priv->lock, flags);
310 __skb_queue_tail(&neigh->queue, skb);
311 spin_unlock_irqrestore(&priv->lock, flags);
312 @@ -1122,8 +1133,8 @@ static int ipoib_hard_header(struct sk_buff *skb,
314 const void *daddr, const void *saddr, unsigned len)
316 + struct ipoib_pseudo_header *phdr;
317 struct ipoib_header *header;
318 - struct ipoib_cb *cb = ipoib_skb_cb(skb);
320 header = (struct ipoib_header *) skb_push(skb, sizeof *header);
322 @@ -1132,12 +1143,13 @@ static int ipoib_hard_header(struct sk_buff *skb,
325 * we don't rely on dst_entry structure, always stuff the
326 - * destination address into skb->cb so we can figure out where
327 + * destination address into skb hard header so we can figure out where
328 * to send the packet later.
330 - memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN);
331 + phdr = (struct ipoib_pseudo_header *) skb_push(skb, sizeof(*phdr));
332 + memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);
334 - return sizeof *header;
335 + return IPOIB_HARD_LEN;
338 static void ipoib_set_mcast_list(struct net_device *dev)
339 @@ -1759,7 +1771,7 @@ void ipoib_setup(struct net_device *dev)
341 dev->flags |= IFF_BROADCAST | IFF_MULTICAST;
343 - dev->hard_header_len = IPOIB_ENCAP_LEN;
344 + dev->hard_header_len = IPOIB_HARD_LEN;
345 dev->addr_len = INFINIBAND_ALEN;
346 dev->type = ARPHRD_INFINIBAND;
347 dev->tx_queue_len = ipoib_sendq_size * 2;
348 diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
349 index d3394b6..1909dd2 100644
350 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
351 +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
352 @@ -796,9 +796,11 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
353 __ipoib_mcast_add(dev, mcast);
354 list_add_tail(&mcast->list, &priv->multicast_list);
356 - if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE)
357 + if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) {
358 + /* put pseudoheader back on for next time */
359 + skb_push(skb, sizeof(struct ipoib_pseudo_header));
360 skb_queue_tail(&mcast->pkt_queue, skb);
363 ++dev->stats.tx_dropped;
364 dev_kfree_skb_any(skb);