From patchwork Wed Oct 12 14:30:30 2016
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Subject: [v2] IB/ipoib: move back IB LL address into the hard header
From: Paolo Abeni <pabeni@redhat.com>
X-Patchwork-Id: 681344
X-Patchwork-Delegate: davem@davemloft.net
Message-Id: <60efcf739ce3d45a01a7127dbaf7fe366e5ddce4.1476264804.git.pabeni@redhat.com>
To: linux-rdma@vger.kernel.org
Cc: Doug Ledford <dledford@redhat.com>, Sean Hefty <sean.hefty@intel.com>,
 Hal Rosenstock <hal.rosenstock@gmail.com>,
 Jason Gunthorpe <jgunthorpe@obsidianresearch.com>, netdev@vger.kernel.org
Date: Wed, 12 Oct 2016 16:30:30 +0200

After the commit 9207f9d45b0a ("net: preserve IP control block
during GSO segmentation"), the GSO CB and the IPoIB CB conflict.
That destroys the IPoIB address information cached there,
causing a severe performance regression, as described in more detail here:

http://marc.info/?l=linux-kernel&m=146787279825501&w=2

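For reference, the layout this patch removes is sketched below; the struct and
the accessor are copied from the first ipoib.h hunk, only the includes and the
surrounding comments are illustrative:

    #include <linux/skbuff.h>        /* struct sk_buff and its 48-byte cb[] */
    #include <net/sch_generic.h>     /* struct qdisc_skb_cb */
    #include <linux/if_infiniband.h> /* INFINIBAND_ALEN (20) */

    /* pre-patch: the LL destination lives in the shared skb->cb scratch area */
    struct ipoib_cb {
            struct qdisc_skb_cb qdisc_cb;
            u8 hwaddr[INFINIBAND_ALEN];
    };

    static inline struct ipoib_cb *ipoib_skb_cb(const struct sk_buff *skb)
    {
            BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct ipoib_cb));
            return (struct ipoib_cb *)skb->cb;
    }

    /* ipoib_hard_header() fills hwaddr and ipoib_start_xmit() reads it back;
     * since 9207f9d45b0a the GSO code keeps its own state in skb->cb in
     * between, so the cached address does not survive segmentation. */
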
This change moves the data cached by the IPoIB driver from the
skb control block into the IPoIB hard header, as was done before
the commit 936d7de3d736 ("IPoIB: Stop lying about hard_header_len
and use skb->cb to stash LL addresses").
To avoid GRO issues, on packet reception the IPoIB driver now
stashes a dummy pseudo header into the skb, so that received
packets actually carry a hard header matching the declared length.
To avoid changing the connected mode maximum MTU, the allocated
head buffer size is increased by the pseudo header length.

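A worked sanity check of the sizing described above; the constants and the
reserve macro are the ones introduced by the hunks below (ALIGN() as in
linux/kernel.h), the arithmetic is only a summary, not additional code to apply:

    enum {
            IPOIB_ENCAP_LEN  = 4,                   /* real ipoib_header */
            IPOIB_PSEUDO_LEN = 20,                  /* dummy header, INFINIBAND_ALEN wide */
            IPOIB_HARD_LEN   = IPOIB_ENCAP_LEN + IPOIB_PSEUDO_LEN, /* 24, new hard_header_len */
    };

    /* connected mode rx: ALIGN(24, 16) - 4 = 28 bytes reserved, so the 4-byte
     * encap header sits at offset 28 and the IP header at 32 -> 16-byte
     * aligned, exactly as the old reserve of 12 achieved. */
    #define IPOIB_CM_RX_RESERVE (ALIGN(IPOIB_HARD_LEN, 16) - IPOIB_ENCAP_LEN)

    /* datagram rx: skb_reserve(20), then GRH (40) + encap (4) precede the
     * payload, so the IP header lands at 20 + 40 + 4 = 64 -> still aligned. */

    /* head buffers are allocated IPOIB_PSEUDO_LEN bytes larger (IPOIB_HARD_LEN
     * instead of IPOIB_ENCAP_LEN on the UD path, + IPOIB_PSEUDO_LEN on the CM
     * path), which is why the connected mode maximum MTU can stay unchanged. */
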
After this commit, IPoIB performance is back to its pre-regression
level.

v1 -> v2: avoid changing the max MTU; increase the head buffer size instead

Fixes: 9207f9d45b0a ("net: preserve IP control block during GSO segmentation")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/infiniband/ulp/ipoib/ipoib.h | 20 +++++++---
 drivers/infiniband/ulp/ipoib/ipoib_cm.c | 15 +++----
 drivers/infiniband/ulp/ipoib/ipoib_ib.c | 12 +++---
 drivers/infiniband/ulp/ipoib/ipoib_main.c | 54 ++++++++++++++++----------
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 6 ++-
 5 files changed, 64 insertions(+), 43 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 9dbfcc0..5ff64af 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -63,6 +63,8 @@ enum ipoib_flush_level {

 enum {
 IPOIB_ENCAP_LEN = 4,
+ IPOIB_PSEUDO_LEN = 20,
+ IPOIB_HARD_LEN = IPOIB_ENCAP_LEN + IPOIB_PSEUDO_LEN,

 IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + IPOIB_ENCAP_LEN,
 IPOIB_UD_RX_SG = 2, /* max buffer needed for 4K mtu */
@@ -134,15 +136,21 @@ struct ipoib_header {
 u16 reserved;
 };

-struct ipoib_cb {
- struct qdisc_skb_cb qdisc_cb;
- u8 hwaddr[INFINIBAND_ALEN];
+struct ipoib_pseudo_header {
+ u8 hwaddr[INFINIBAND_ALEN];
 };

-static inline struct ipoib_cb *ipoib_skb_cb(const struct sk_buff *skb)
+static inline void skb_add_pseudo_hdr(struct sk_buff *skb)
 {
- BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct ipoib_cb));
- return (struct ipoib_cb *)skb->cb;
+ char *data = skb_push(skb, IPOIB_PSEUDO_LEN);
+
+ /*
+ * only the ipoib header is present now, make room for a dummy
+ * pseudo header and set skb field accordingly
+ */
+ memset(data, 0, IPOIB_PSEUDO_LEN);
+ skb_reset_mac_header(skb);
+ skb_pull(skb, IPOIB_HARD_LEN);
 }

 /* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 4ad297d..339a1ee 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -63,6 +63,8 @@ MODULE_PARM_DESC(cm_data_debug_level,
 #define IPOIB_CM_RX_DELAY (3 * 256 * HZ)
 #define IPOIB_CM_RX_UPDATE_MASK (0x3)

+#define IPOIB_CM_RX_RESERVE (ALIGN(IPOIB_HARD_LEN, 16) - IPOIB_ENCAP_LEN)
+
 static struct ib_qp_attr ipoib_cm_err_attr = {
 .qp_state = IB_QPS_ERR
 };
@@ -146,15 +148,15 @@ static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev,
 struct sk_buff *skb;
 int i;

- skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12);
+ skb = dev_alloc_skb(ALIGN(IPOIB_CM_HEAD_SIZE + IPOIB_PSEUDO_LEN, 16));
 if (unlikely(!skb))
 return NULL;

 /*
- * IPoIB adds a 4 byte header. So we need 12 more bytes to align the
+ * IPoIB adds a IPOIB_ENCAP_LEN byte header, this will align the
 * IP header to a multiple of 16.
 */
- skb_reserve(skb, 12);
+ skb_reserve(skb, IPOIB_CM_RX_RESERVE);

 mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE,
 DMA_FROM_DEVICE);
@@ -624,9 +626,9 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 if (wc->byte_len < IPOIB_CM_COPYBREAK) {
 int dlen = wc->byte_len;

- small_skb = dev_alloc_skb(dlen + 12);
+ small_skb = dev_alloc_skb(dlen + IPOIB_CM_RX_RESERVE);
 if (small_skb) {
- skb_reserve(small_skb, 12);
+ skb_reserve(small_skb, IPOIB_CM_RX_RESERVE);
 ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0],
 dlen, DMA_FROM_DEVICE);
 skb_copy_from_linear_data(skb, small_skb->data, dlen);
@@ -663,8 +665,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)

 copied:
 skb->protocol = ((struct ipoib_header *) skb->data)->proto;
- skb_reset_mac_header(skb);
- skb_pull(skb, IPOIB_ENCAP_LEN);
+ skb_add_pseudo_hdr(skb);

 ++dev->stats.rx_packets;
 dev->stats.rx_bytes += skb->len;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index be11d5d..830fecb 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -128,16 +128,15 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)

 buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);

- skb = dev_alloc_skb(buf_size + IPOIB_ENCAP_LEN);
+ skb = dev_alloc_skb(buf_size + IPOIB_HARD_LEN);
 if (unlikely(!skb))
 return NULL;

 /*
- * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte
- * header. So we need 4 more bytes to get to 48 and align the
- * IP header to a multiple of 16.
+ * the IP header will be at IPOIP_HARD_LEN + IB_GRH_BYTES, that is
+ * 64 bytes aligned
 */
- skb_reserve(skb, 4);
+ skb_reserve(skb, sizeof(struct ipoib_pseudo_header));

 mapping = priv->rx_ring[id].mapping;
 mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size,
@@ -253,8 +252,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 skb_pull(skb, IB_GRH_BYTES);

 skb->protocol = ((struct ipoib_header *) skb->data)->proto;
- skb_reset_mac_header(skb);
- skb_pull(skb, IPOIB_ENCAP_LEN);
+ skb_add_pseudo_hdr(skb);

 ++dev->stats.rx_packets;
 dev->stats.rx_bytes += skb->len;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index cc1c1b0..823a528 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -925,9 +925,12 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
 ipoib_neigh_free(neigh);
 goto err_drop;
 }
- if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
+ if (skb_queue_len(&neigh->queue) <
+ IPOIB_MAX_PATH_REC_QUEUE) {
+ /* put pseudoheader back on for next time */
+ skb_push(skb, IPOIB_PSEUDO_LEN);
 __skb_queue_tail(&neigh->queue, skb);
- else {
+ } else {
 ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
 skb_queue_len(&neigh->queue));
 goto err_drop;
@@ -964,7 +967,7 @@ err_drop:
 }

 static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
- struct ipoib_cb *cb)
+ struct ipoib_pseudo_header *phdr)
 {
 struct ipoib_dev_priv *priv = netdev_priv(dev);
 struct ipoib_path *path;
@@ -972,16 +975,18 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,

 spin_lock_irqsave(&priv->lock, flags);

- path = __path_find(dev, cb->hwaddr + 4);
+ path = __path_find(dev, phdr->hwaddr + 4);
 if (!path || !path->valid) {
 int new_path = 0;

 if (!path) {
- path = path_rec_create(dev, cb->hwaddr + 4);
+ path = path_rec_create(dev, phdr->hwaddr + 4);
 new_path = 1;
 }
 if (path) {
 if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+ /* put pseudoheader back on for next time */
+ skb_push(skb, IPOIB_PSEUDO_LEN);
 __skb_queue_tail(&path->queue, skb);
 } else {
 ++dev->stats.tx_dropped;
@@ -1009,10 +1014,12 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
 be16_to_cpu(path->pathrec.dlid));

 spin_unlock_irqrestore(&priv->lock, flags);
- ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr));
+ ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr));
 return;
 } else if ((path->query || !path_rec_start(dev, path)) &&
 skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+ /* put pseudoheader back on for next time */
+ skb_push(skb, IPOIB_PSEUDO_LEN);
 __skb_queue_tail(&path->queue, skb);
 } else {
 ++dev->stats.tx_dropped;
@@ -1026,13 +1033,15 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 struct ipoib_dev_priv *priv = netdev_priv(dev);
 struct ipoib_neigh *neigh;
- struct ipoib_cb *cb = ipoib_skb_cb(skb);
+ struct ipoib_pseudo_header *phdr;
 struct ipoib_header *header;
 unsigned long flags;

+ phdr = (struct ipoib_pseudo_header *) skb->data;
+ skb_pull(skb, sizeof(*phdr));
 header = (struct ipoib_header *) skb->data;

- if (unlikely(cb->hwaddr[4] == 0xff)) {
+ if (unlikely(phdr->hwaddr[4] == 0xff)) {
 /* multicast, arrange "if" according to probability */
 if ((header->proto != htons(ETH_P_IP)) &&
 (header->proto != htons(ETH_P_IPV6)) &&
@@ -1045,13 +1054,13 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 return NETDEV_TX_OK;
 }
 /* Add in the P_Key for multicast*/
- cb->hwaddr[8] = (priv->pkey >> 8) & 0xff;
- cb->hwaddr[9] = priv->pkey & 0xff;
+ phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
+ phdr->hwaddr[9] = priv->pkey & 0xff;

- neigh = ipoib_neigh_get(dev, cb->hwaddr);
+ neigh = ipoib_neigh_get(dev, phdr->hwaddr);
 if (likely(neigh))
 goto send_using_neigh;
- ipoib_mcast_send(dev, cb->hwaddr, skb);
+ ipoib_mcast_send(dev, phdr->hwaddr, skb);
 return NETDEV_TX_OK;
 }

@@ -1060,16 +1069,16 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 case htons(ETH_P_IP):
 case htons(ETH_P_IPV6):
 case htons(ETH_P_TIPC):
- neigh = ipoib_neigh_get(dev, cb->hwaddr);
+ neigh = ipoib_neigh_get(dev, phdr->hwaddr);
 if (unlikely(!neigh)) {
- neigh_add_path(skb, cb->hwaddr, dev);
+ neigh_add_path(skb, phdr->hwaddr, dev);
 return NETDEV_TX_OK;
 }
 break;
 case htons(ETH_P_ARP):
 case htons(ETH_P_RARP):
 /* for unicast ARP and RARP should always perform path find */
- unicast_arp_send(skb, dev, cb);
+ unicast_arp_send(skb, dev, phdr);
 return NETDEV_TX_OK;
 default:
 /* ethertype not supported by IPoIB */
@@ -1086,11 +1095,13 @@ send_using_neigh:
 goto unref;
 }
 } else if (neigh->ah) {
- ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr));
+ ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(phdr->hwaddr));
 goto unref;
 }

 if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+ /* put pseudoheader back on for next time */
+ skb_push(skb, sizeof(*phdr));
 spin_lock_irqsave(&priv->lock, flags);
 __skb_queue_tail(&neigh->queue, skb);
 spin_unlock_irqrestore(&priv->lock, flags);
@@ -1122,8 +1133,8 @@ static int ipoib_hard_header(struct sk_buff *skb,
 unsigned short type,
 const void *daddr, const void *saddr, unsigned len)
 {
+ struct ipoib_pseudo_header *phdr;
 struct ipoib_header *header;
- struct ipoib_cb *cb = ipoib_skb_cb(skb);

 header = (struct ipoib_header *) skb_push(skb, sizeof *header);

@@ -1132,12 +1143,13 @@ static int ipoib_hard_header(struct sk_buff *skb,

 /*
 * we don't rely on dst_entry structure, always stuff the
- * destination address into skb->cb so we can figure out where
+ * destination address into skb hard header so we can figure out where
 * to send the packet later.
 */
- memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN);
+ phdr = (struct ipoib_pseudo_header *) skb_push(skb, sizeof(*phdr));
+ memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);

- return sizeof *header;
+ return IPOIB_HARD_LEN;
 }

 static void ipoib_set_mcast_list(struct net_device *dev)
@@ -1759,7 +1771,7 @@ void ipoib_setup(struct net_device *dev)

 dev->flags |= IFF_BROADCAST | IFF_MULTICAST;

- dev->hard_header_len = IPOIB_ENCAP_LEN;
+ dev->hard_header_len = IPOIB_HARD_LEN;
 dev->addr_len = INFINIBAND_ALEN;
 dev->type = ARPHRD_INFINIBAND;
 dev->tx_queue_len = ipoib_sendq_size * 2;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index d3394b6..1909dd2 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -796,9 +796,11 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
 __ipoib_mcast_add(dev, mcast);
 list_add_tail(&mcast->list, &priv->multicast_list);
 }
- if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE)
+ if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) {
+ /* put pseudoheader back on for next time */
+ skb_push(skb, sizeof(struct ipoib_pseudo_header));
 skb_queue_tail(&mcast->pkt_queue, skb);
- else {
+ } else {
 ++dev->stats.tx_dropped;
 dev_kfree_skb_any(skb);
 }