]>
Commit | Line | Data |
---|---|---|
1597fc8b WB |
1 | From patchwork Wed Oct 12 14:30:30 2016 |
2 | Content-Type: text/plain; charset="utf-8" | |
3 | MIME-Version: 1.0 | |
4 | Content-Transfer-Encoding: 7bit | |
5 | Subject: [v2] IB/ipoib: move back IB LL address into the hard header | |
6 | From: Paolo Abeni <pabeni@redhat.com> | |
7 | X-Patchwork-Id: 681344 | |
8 | X-Patchwork-Delegate: davem@davemloft.net | |
9 | Message-Id: <60efcf739ce3d45a01a7127dbaf7fe366e5ddce4.1476264804.git.pabeni@redhat.com> | |
10 | To: linux-rdma@vger.kernel.org | |
11 | Cc: Doug Ledford <dledford@redhat.com>, Sean Hefty <sean.hefty@intel.com>, | |
12 | Hal Rosenstock <hal.rosenstock@gmail.com>, | |
13 | Jason Gunthorpe <jgunthorpe@obsidianresearch.com>, netdev@vger.kernel.org | |
14 | Date: Wed, 12 Oct 2016 16:30:30 +0200 | |
15 | ||
16 | After the commit 9207f9d45b0a ("net: preserve IP control block | |
17 | during GSO segmentation"), the GSO CB and the IPoIB CB conflict. | |
18 | That destroys the IPoIB address information cached there, | |
19 | causing a severe performance regression, as better described here: | |
20 | ||
21 | http://marc.info/?l=linux-kernel&m=146787279825501&w=2 | |
22 | ||
23 | This change moves the data cached by the IPoIB driver from the | |
24 | skb control block into the IPoIB hard header, as done before | |
25 | the commit 936d7de3d736 ("IPoIB: Stop lying about hard_header_len | |
26 | and use skb->cb to stash LL addresses"). | |
27 | In order to avoid GRO issues, on packet reception, the IPoIB driver | |
28 | stashes into the skb a dummy pseudo header, so that the received | |
29 | packets actually have a hard header matching the declared length. | |
30 | To avoid changing the connected mode maximum mtu, the allocated | |
31 | head buffer size is increased by the pseudo header length. | |
32 | ||
33 | After this commit, IPoIB performance is back to the pre-regression | |
34 | value. | |
35 | ||
36 | v1 -> v2: avoid changing the max mtu, increasing the head buf size | |
37 | ||
38 | Fixes: 9207f9d45b0a ("net: preserve IP control block during GSO segmentation") | |
39 | Signed-off-by: Paolo Abeni <pabeni@redhat.com> | |
40 | --- | |
41 | drivers/infiniband/ulp/ipoib/ipoib.h | 20 +++++++--- | |
42 | drivers/infiniband/ulp/ipoib/ipoib_cm.c | 15 +++---- | |
43 | drivers/infiniband/ulp/ipoib/ipoib_ib.c | 12 +++--- | |
44 | drivers/infiniband/ulp/ipoib/ipoib_main.c | 54 ++++++++++++++++---------- | |
45 | drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 6 ++- | |
46 | 5 files changed, 64 insertions(+), 43 deletions(-) | |
47 | ||
48 | diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h | |
49 | index 9dbfcc0..5ff64af 100644 | |
50 | --- a/drivers/infiniband/ulp/ipoib/ipoib.h | |
51 | +++ b/drivers/infiniband/ulp/ipoib/ipoib.h | |
52 | @@ -63,6 +63,8 @@ enum ipoib_flush_level { | |
53 | ||
54 | enum { | |
55 | IPOIB_ENCAP_LEN = 4, | |
56 | + IPOIB_PSEUDO_LEN = 20, | |
57 | + IPOIB_HARD_LEN = IPOIB_ENCAP_LEN + IPOIB_PSEUDO_LEN, | |
58 | ||
59 | IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + IPOIB_ENCAP_LEN, | |
60 | IPOIB_UD_RX_SG = 2, /* max buffer needed for 4K mtu */ | |
61 | @@ -134,15 +136,21 @@ struct ipoib_header { | |
62 | u16 reserved; | |
63 | }; | |
64 | ||
65 | -struct ipoib_cb { | |
66 | - struct qdisc_skb_cb qdisc_cb; | |
67 | - u8 hwaddr[INFINIBAND_ALEN]; | |
68 | +struct ipoib_pseudo_header { | |
69 | + u8 hwaddr[INFINIBAND_ALEN]; | |
70 | }; | |
71 | ||
72 | -static inline struct ipoib_cb *ipoib_skb_cb(const struct sk_buff *skb) | |
73 | +static inline void skb_add_pseudo_hdr(struct sk_buff *skb) | |
74 | { | |
75 | - BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct ipoib_cb)); | |
76 | - return (struct ipoib_cb *)skb->cb; | |
77 | + char *data = skb_push(skb, IPOIB_PSEUDO_LEN); | |
78 | + | |
79 | + /* | |
80 | + * only the ipoib header is present now, make room for a dummy | |
81 | + * pseudo header and set skb field accordingly | |
82 | + */ | |
83 | + memset(data, 0, IPOIB_PSEUDO_LEN); | |
84 | + skb_reset_mac_header(skb); | |
85 | + skb_pull(skb, IPOIB_HARD_LEN); | |
86 | } | |
87 | ||
88 | /* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */ | |
89 | diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c | |
90 | index 4ad297d..339a1ee 100644 | |
91 | --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c | |
92 | +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c | |
93 | @@ -63,6 +63,8 @@ MODULE_PARM_DESC(cm_data_debug_level, | |
94 | #define IPOIB_CM_RX_DELAY (3 * 256 * HZ) | |
95 | #define IPOIB_CM_RX_UPDATE_MASK (0x3) | |
96 | ||
97 | +#define IPOIB_CM_RX_RESERVE (ALIGN(IPOIB_HARD_LEN, 16) - IPOIB_ENCAP_LEN) | |
98 | + | |
99 | static struct ib_qp_attr ipoib_cm_err_attr = { | |
100 | .qp_state = IB_QPS_ERR | |
101 | }; | |
102 | @@ -146,15 +148,15 @@ static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, | |
103 | struct sk_buff *skb; | |
104 | int i; | |
105 | ||
106 | - skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12); | |
107 | + skb = dev_alloc_skb(ALIGN(IPOIB_CM_HEAD_SIZE + IPOIB_PSEUDO_LEN, 16)); | |
108 | if (unlikely(!skb)) | |
109 | return NULL; | |
110 | ||
111 | /* | |
112 | - * IPoIB adds a 4 byte header. So we need 12 more bytes to align the | |
113 | + * IPoIB adds a IPOIB_ENCAP_LEN byte header, this will align the | |
114 | * IP header to a multiple of 16. | |
115 | */ | |
116 | - skb_reserve(skb, 12); | |
117 | + skb_reserve(skb, IPOIB_CM_RX_RESERVE); | |
118 | ||
119 | mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE, | |
120 | DMA_FROM_DEVICE); | |
121 | @@ -624,9 +626,9 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) | |
122 | if (wc->byte_len < IPOIB_CM_COPYBREAK) { | |
123 | int dlen = wc->byte_len; | |
124 | ||
125 | - small_skb = dev_alloc_skb(dlen + 12); | |
126 | + small_skb = dev_alloc_skb(dlen + IPOIB_CM_RX_RESERVE); | |
127 | if (small_skb) { | |
128 | - skb_reserve(small_skb, 12); | |
129 | + skb_reserve(small_skb, IPOIB_CM_RX_RESERVE); | |
130 | ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0], | |
131 | dlen, DMA_FROM_DEVICE); | |
132 | skb_copy_from_linear_data(skb, small_skb->data, dlen); | |
133 | @@ -663,8 +665,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) | |
134 | ||
135 | copied: | |
136 | skb->protocol = ((struct ipoib_header *) skb->data)->proto; | |
137 | - skb_reset_mac_header(skb); | |
138 | - skb_pull(skb, IPOIB_ENCAP_LEN); | |
139 | + skb_add_pseudo_hdr(skb); | |
140 | ||
141 | ++dev->stats.rx_packets; | |
142 | dev->stats.rx_bytes += skb->len; | |
143 | diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c | |
144 | index be11d5d..830fecb 100644 | |
145 | --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c | |
146 | +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c | |
147 | @@ -128,16 +128,15 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id) | |
148 | ||
149 | buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); | |
150 | ||
151 | - skb = dev_alloc_skb(buf_size + IPOIB_ENCAP_LEN); | |
152 | + skb = dev_alloc_skb(buf_size + IPOIB_HARD_LEN); | |
153 | if (unlikely(!skb)) | |
154 | return NULL; | |
155 | ||
156 | /* | |
157 | - * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte | |
158 | - * header. So we need 4 more bytes to get to 48 and align the | |
159 | - * IP header to a multiple of 16. | |
160 | + * the IP header will be at IPOIP_HARD_LEN + IB_GRH_BYTES, that is | |
161 | + * 64 bytes aligned | |
162 | */ | |
163 | - skb_reserve(skb, 4); | |
164 | + skb_reserve(skb, sizeof(struct ipoib_pseudo_header)); | |
165 | ||
166 | mapping = priv->rx_ring[id].mapping; | |
167 | mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size, | |
168 | @@ -253,8 +252,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) | |
169 | skb_pull(skb, IB_GRH_BYTES); | |
170 | ||
171 | skb->protocol = ((struct ipoib_header *) skb->data)->proto; | |
172 | - skb_reset_mac_header(skb); | |
173 | - skb_pull(skb, IPOIB_ENCAP_LEN); | |
174 | + skb_add_pseudo_hdr(skb); | |
175 | ||
176 | ++dev->stats.rx_packets; | |
177 | dev->stats.rx_bytes += skb->len; | |
178 | diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c | |
179 | index cc1c1b0..823a528 100644 | |
180 | --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c | |
181 | +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c | |
182 | @@ -925,9 +925,12 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr, | |
183 | ipoib_neigh_free(neigh); | |
184 | goto err_drop; | |
185 | } | |
186 | - if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) | |
187 | + if (skb_queue_len(&neigh->queue) < | |
188 | + IPOIB_MAX_PATH_REC_QUEUE) { | |
189 | + /* put pseudoheader back on for next time */ | |
190 | + skb_push(skb, IPOIB_PSEUDO_LEN); | |
191 | __skb_queue_tail(&neigh->queue, skb); | |
192 | - else { | |
193 | + } else { | |
194 | ipoib_warn(priv, "queue length limit %d. Packet drop.\n", | |
195 | skb_queue_len(&neigh->queue)); | |
196 | goto err_drop; | |
197 | @@ -964,7 +967,7 @@ err_drop: | |
198 | } | |
199 | ||
200 | static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, | |
201 | - struct ipoib_cb *cb) | |
202 | + struct ipoib_pseudo_header *phdr) | |
203 | { | |
204 | struct ipoib_dev_priv *priv = netdev_priv(dev); | |
205 | struct ipoib_path *path; | |
206 | @@ -972,16 +975,18 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, | |
207 | ||
208 | spin_lock_irqsave(&priv->lock, flags); | |
209 | ||
210 | - path = __path_find(dev, cb->hwaddr + 4); | |
211 | + path = __path_find(dev, phdr->hwaddr + 4); | |
212 | if (!path || !path->valid) { | |
213 | int new_path = 0; | |
214 | ||
215 | if (!path) { | |
216 | - path = path_rec_create(dev, cb->hwaddr + 4); | |
217 | + path = path_rec_create(dev, phdr->hwaddr + 4); | |
218 | new_path = 1; | |
219 | } | |
220 | if (path) { | |
221 | if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { | |
222 | + /* put pseudoheader back on for next time */ | |
223 | + skb_push(skb, IPOIB_PSEUDO_LEN); | |
224 | __skb_queue_tail(&path->queue, skb); | |
225 | } else { | |
226 | ++dev->stats.tx_dropped; | |
227 | @@ -1009,10 +1014,12 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, | |
228 | be16_to_cpu(path->pathrec.dlid)); | |
229 | ||
230 | spin_unlock_irqrestore(&priv->lock, flags); | |
231 | - ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr)); | |
232 | + ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr)); | |
233 | return; | |
234 | } else if ((path->query || !path_rec_start(dev, path)) && | |
235 | skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { | |
236 | + /* put pseudoheader back on for next time */ | |
237 | + skb_push(skb, IPOIB_PSEUDO_LEN); | |
238 | __skb_queue_tail(&path->queue, skb); | |
239 | } else { | |
240 | ++dev->stats.tx_dropped; | |
241 | @@ -1026,13 +1033,15 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) | |
242 | { | |
243 | struct ipoib_dev_priv *priv = netdev_priv(dev); | |
244 | struct ipoib_neigh *neigh; | |
245 | - struct ipoib_cb *cb = ipoib_skb_cb(skb); | |
246 | + struct ipoib_pseudo_header *phdr; | |
247 | struct ipoib_header *header; | |
248 | unsigned long flags; | |
249 | ||
250 | + phdr = (struct ipoib_pseudo_header *) skb->data; | |
251 | + skb_pull(skb, sizeof(*phdr)); | |
252 | header = (struct ipoib_header *) skb->data; | |
253 | ||
254 | - if (unlikely(cb->hwaddr[4] == 0xff)) { | |
255 | + if (unlikely(phdr->hwaddr[4] == 0xff)) { | |
256 | /* multicast, arrange "if" according to probability */ | |
257 | if ((header->proto != htons(ETH_P_IP)) && | |
258 | (header->proto != htons(ETH_P_IPV6)) && | |
259 | @@ -1045,13 +1054,13 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) | |
260 | return NETDEV_TX_OK; | |
261 | } | |
262 | /* Add in the P_Key for multicast*/ | |
263 | - cb->hwaddr[8] = (priv->pkey >> 8) & 0xff; | |
264 | - cb->hwaddr[9] = priv->pkey & 0xff; | |
265 | + phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff; | |
266 | + phdr->hwaddr[9] = priv->pkey & 0xff; | |
267 | ||
268 | - neigh = ipoib_neigh_get(dev, cb->hwaddr); | |
269 | + neigh = ipoib_neigh_get(dev, phdr->hwaddr); | |
270 | if (likely(neigh)) | |
271 | goto send_using_neigh; | |
272 | - ipoib_mcast_send(dev, cb->hwaddr, skb); | |
273 | + ipoib_mcast_send(dev, phdr->hwaddr, skb); | |
274 | return NETDEV_TX_OK; | |
275 | } | |
276 | ||
277 | @@ -1060,16 +1069,16 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) | |
278 | case htons(ETH_P_IP): | |
279 | case htons(ETH_P_IPV6): | |
280 | case htons(ETH_P_TIPC): | |
281 | - neigh = ipoib_neigh_get(dev, cb->hwaddr); | |
282 | + neigh = ipoib_neigh_get(dev, phdr->hwaddr); | |
283 | if (unlikely(!neigh)) { | |
284 | - neigh_add_path(skb, cb->hwaddr, dev); | |
285 | + neigh_add_path(skb, phdr->hwaddr, dev); | |
286 | return NETDEV_TX_OK; | |
287 | } | |
288 | break; | |
289 | case htons(ETH_P_ARP): | |
290 | case htons(ETH_P_RARP): | |
291 | /* for unicast ARP and RARP should always perform path find */ | |
292 | - unicast_arp_send(skb, dev, cb); | |
293 | + unicast_arp_send(skb, dev, phdr); | |
294 | return NETDEV_TX_OK; | |
295 | default: | |
296 | /* ethertype not supported by IPoIB */ | |
297 | @@ -1086,11 +1095,13 @@ send_using_neigh: | |
298 | goto unref; | |
299 | } | |
300 | } else if (neigh->ah) { | |
301 | - ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr)); | |
302 | + ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(phdr->hwaddr)); | |
303 | goto unref; | |
304 | } | |
305 | ||
306 | if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { | |
307 | + /* put pseudoheader back on for next time */ | |
308 | + skb_push(skb, sizeof(*phdr)); | |
309 | spin_lock_irqsave(&priv->lock, flags); | |
310 | __skb_queue_tail(&neigh->queue, skb); | |
311 | spin_unlock_irqrestore(&priv->lock, flags); | |
312 | @@ -1122,8 +1133,8 @@ static int ipoib_hard_header(struct sk_buff *skb, | |
313 | unsigned short type, | |
314 | const void *daddr, const void *saddr, unsigned len) | |
315 | { | |
316 | + struct ipoib_pseudo_header *phdr; | |
317 | struct ipoib_header *header; | |
318 | - struct ipoib_cb *cb = ipoib_skb_cb(skb); | |
319 | ||
320 | header = (struct ipoib_header *) skb_push(skb, sizeof *header); | |
321 | ||
322 | @@ -1132,12 +1143,13 @@ static int ipoib_hard_header(struct sk_buff *skb, | |
323 | ||
324 | /* | |
325 | * we don't rely on dst_entry structure, always stuff the | |
326 | - * destination address into skb->cb so we can figure out where | |
327 | + * destination address into skb hard header so we can figure out where | |
328 | * to send the packet later. | |
329 | */ | |
330 | - memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN); | |
331 | + phdr = (struct ipoib_pseudo_header *) skb_push(skb, sizeof(*phdr)); | |
332 | + memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN); | |
333 | ||
334 | - return sizeof *header; | |
335 | + return IPOIB_HARD_LEN; | |
336 | } | |
337 | ||
338 | static void ipoib_set_mcast_list(struct net_device *dev) | |
339 | @@ -1759,7 +1771,7 @@ void ipoib_setup(struct net_device *dev) | |
340 | ||
341 | dev->flags |= IFF_BROADCAST | IFF_MULTICAST; | |
342 | ||
343 | - dev->hard_header_len = IPOIB_ENCAP_LEN; | |
344 | + dev->hard_header_len = IPOIB_HARD_LEN; | |
345 | dev->addr_len = INFINIBAND_ALEN; | |
346 | dev->type = ARPHRD_INFINIBAND; | |
347 | dev->tx_queue_len = ipoib_sendq_size * 2; | |
348 | diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c | |
349 | index d3394b6..1909dd2 100644 | |
350 | --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c | |
351 | +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c | |
352 | @@ -796,9 +796,11 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) | |
353 | __ipoib_mcast_add(dev, mcast); | |
354 | list_add_tail(&mcast->list, &priv->multicast_list); | |
355 | } | |
356 | - if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) | |
357 | + if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) { | |
358 | + /* put pseudoheader back on for next time */ | |
359 | + skb_push(skb, sizeof(struct ipoib_pseudo_header)); | |
360 | skb_queue_tail(&mcast->pkt_queue, skb); | |
361 | - else { | |
362 | + } else { | |
363 | ++dev->stats.tx_dropped; | |
364 | dev_kfree_skb_any(skb); | |
365 | } |