/*
 * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <net/busy_poll.h>
#include <linux/mlx4/cq.h>
#include <linux/slab.h>
#include <linux/mlx4/qp.h>
#include <linux/skbuff.h>
#include <linux/rculist.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/vmalloc.h>
#include <linux/irq.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ip6_checksum.h>
#endif

#include "mlx4_en.h"
static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
			    struct mlx4_en_rx_alloc *page_alloc,
			    const struct mlx4_en_frag_info *frag_info,
			    gfp_t _gfp)
{
	int order;
	struct page *page;
	dma_addr_t dma;

	for (order = MLX4_EN_ALLOC_PREFER_ORDER; ;) {
		gfp_t gfp = _gfp;

		if (order)
			gfp |= __GFP_COMP | __GFP_NOWARN;
		page = alloc_pages(gfp, order);
		if (likely(page))
			break;
		if (--order < 0 ||
		    ((PAGE_SIZE << order) < frag_info->frag_size))
			return -ENOMEM;
	}
	dma = dma_map_page(priv->ddev, page, 0, PAGE_SIZE << order,
			   PCI_DMA_FROMDEVICE);
	if (dma_mapping_error(priv->ddev, dma)) {
		put_page(page);
		return -ENOMEM;
	}
	page_alloc->page_size = PAGE_SIZE << order;
	page_alloc->page = page;
	page_alloc->dma = dma;
	page_alloc->page_offset = 0;
	/* Not doing get_page() for each frag is a big win
	 * on asymmetric workloads. Note we can not use atomic_set().
	 */
	atomic_add(page_alloc->page_size / frag_info->frag_stride - 1,
		   &page->_count);
	return 0;
}
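
/* One higher-order page is carved into frag_stride-sized pieces, so instead
 * of a get_page() per fragment the page's reference count is pre-charged
 * above once for every piece it will serve; a single reference is then
 * dropped as each piece is consumed (see mlx4_en_free_frag() and
 * mlx4_en_complete_rx_desc() below).
 */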
static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
			       struct mlx4_en_rx_desc *rx_desc,
			       struct mlx4_en_rx_alloc *frags,
			       struct mlx4_en_rx_alloc *ring_alloc,
			       gfp_t gfp)
{
	struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS];
	const struct mlx4_en_frag_info *frag_info;
	struct page *page;
	dma_addr_t dma;
	int i;

	for (i = 0; i < priv->num_frags; i++) {
		frag_info = &priv->frag_info[i];
		page_alloc[i] = ring_alloc[i];
		page_alloc[i].page_offset += frag_info->frag_stride;

		if (page_alloc[i].page_offset + frag_info->frag_stride <=
		    ring_alloc[i].page_size)
			continue;

		if (mlx4_alloc_pages(priv, &page_alloc[i], frag_info, gfp))
			goto out;
	}

	for (i = 0; i < priv->num_frags; i++) {
		frags[i] = ring_alloc[i];
		dma = ring_alloc[i].dma + ring_alloc[i].page_offset;
		ring_alloc[i] = page_alloc[i];
		rx_desc->data[i].addr = cpu_to_be64(dma);
	}

	return 0;

out:
	while (i--) {
		if (page_alloc[i].page != ring_alloc[i].page) {
			dma_unmap_page(priv->ddev, page_alloc[i].dma,
				page_alloc[i].page_size, PCI_DMA_FROMDEVICE);
			page = page_alloc[i].page;
			atomic_set(&page->_count, 1);
			put_page(page);
		}
	}
	return -ENOMEM;
}
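
/* The scheme above is replace-then-commit: page_alloc[] is first advanced
 * (or refilled) as a scratch copy, and only once every fragment of the
 * descriptor has backing storage is ring_alloc[] updated and the previous
 * pages handed to 'frags' for the completed descriptor.  On failure the
 * scratch pages that differ from ring_alloc[] are unmapped and released,
 * leaving the ring state untouched.
 */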
static void mlx4_en_free_frag(struct mlx4_en_priv *priv,
			      struct mlx4_en_rx_alloc *frags,
			      int i)
{
	const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
	u32 next_frag_end = frags[i].page_offset + 2 * frag_info->frag_stride;

	if (next_frag_end > frags[i].page_size)
		dma_unmap_page(priv->ddev, frags[i].dma, frags[i].page_size,
			       PCI_DMA_FROMDEVICE);

	if (frags[i].page)
		put_page(frags[i].page);
}
static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
				  struct mlx4_en_rx_ring *ring)
{
	int i;
	struct mlx4_en_rx_alloc *page_alloc;

	for (i = 0; i < priv->num_frags; i++) {
		const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];

		if (mlx4_alloc_pages(priv, &ring->page_alloc[i],
				     frag_info, GFP_KERNEL | __GFP_COLD))
			goto out;

		en_dbg(DRV, priv, "  frag %d allocator: - size:%d frags:%d\n",
		       i, ring->page_alloc[i].page_size,
		       atomic_read(&ring->page_alloc[i].page->_count));
	}
	return 0;

out:
	while (i--) {
		struct page *page;

		page_alloc = &ring->page_alloc[i];
		dma_unmap_page(priv->ddev, page_alloc->dma,
			       page_alloc->page_size, PCI_DMA_FROMDEVICE);
		page = page_alloc->page;
		atomic_set(&page->_count, 1);
		put_page(page);
		page_alloc->page = NULL;
	}
	return -ENOMEM;
}
static void mlx4_en_destroy_allocator(struct mlx4_en_priv *priv,
				      struct mlx4_en_rx_ring *ring)
{
	int i;
	struct mlx4_en_rx_alloc *page_alloc;

	for (i = 0; i < priv->num_frags; i++) {
		const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];

		page_alloc = &ring->page_alloc[i];
		en_dbg(DRV, priv, "Freeing allocator:%d count:%d\n",
		       i, page_count(page_alloc->page));

		dma_unmap_page(priv->ddev, page_alloc->dma,
			       page_alloc->page_size, PCI_DMA_FROMDEVICE);
		while (page_alloc->page_offset + frag_info->frag_stride <
		       page_alloc->page_size) {
			put_page(page_alloc->page);
			page_alloc->page_offset += frag_info->frag_stride;
		}
		page_alloc->page = NULL;
	}
}
static void mlx4_en_init_rx_desc(struct mlx4_en_priv *priv,
				 struct mlx4_en_rx_ring *ring, int index)
{
	struct mlx4_en_rx_desc *rx_desc = ring->buf + ring->stride * index;
	int possible_frags;
	int i;

	/* Set size and memtype fields */
	for (i = 0; i < priv->num_frags; i++) {
		rx_desc->data[i].byte_count =
			cpu_to_be32(priv->frag_info[i].frag_size);
		rx_desc->data[i].lkey = cpu_to_be32(priv->mdev->mr.key);
	}

	/* If the number of used fragments does not fill up the ring stride,
	 * remaining (unused) fragments must be padded with null address/size
	 * and a special memory key */
	possible_frags = (ring->stride - sizeof(struct mlx4_en_rx_desc)) / DS_SIZE;
	for (i = priv->num_frags; i < possible_frags; i++) {
		rx_desc->data[i].byte_count = 0;
		rx_desc->data[i].lkey = cpu_to_be32(MLX4_EN_MEMTYPE_PAD);
		rx_desc->data[i].addr = 0;
	}
}
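
/* Illustration of the padding above: if the stride leaves room for, say,
 * four data segments but only one fragment is in use, entries 1-3 get a
 * zero byte count, a zero address and the MLX4_EN_MEMTYPE_PAD key so the
 * hardware skips them instead of scattering into unmapped memory.
 */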
static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
				   struct mlx4_en_rx_ring *ring, int index,
				   gfp_t gfp)
{
	struct mlx4_en_rx_desc *rx_desc = ring->buf + (index * ring->stride);
	struct mlx4_en_rx_alloc *frags = ring->rx_info +
					(index << priv->log_rx_info);

	return mlx4_en_alloc_frags(priv, rx_desc, frags, ring->page_alloc, gfp);
}
static inline void mlx4_en_update_rx_prod_db(struct mlx4_en_rx_ring *ring)
{
	*ring->wqres.db.db = cpu_to_be32(ring->prod & 0xffff);
}
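
/* The receive doorbell is a record in host memory rather than a register
 * write; presumably the HCA samples it to learn the new producer index.
 * Only the low 16 bits are significant, hence the 0xffff mask above.
 */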
static void mlx4_en_free_rx_desc(struct mlx4_en_priv *priv,
				 struct mlx4_en_rx_ring *ring,
				 int index)
{
	struct mlx4_en_rx_alloc *frags;
	int nr;

	frags = ring->rx_info + (index << priv->log_rx_info);
	for (nr = 0; nr < priv->num_frags; nr++) {
		en_dbg(DRV, priv, "Freeing fragment:%d\n", nr);
		mlx4_en_free_frag(priv, frags, nr);
	}
}
static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv)
{
	struct mlx4_en_rx_ring *ring;
	int ring_ind;
	int buf_ind;
	int new_size;

	for (buf_ind = 0; buf_ind < priv->prof->rx_ring_size; buf_ind++) {
		for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
			ring = priv->rx_ring[ring_ind];

			if (mlx4_en_prepare_rx_desc(priv, ring,
						    ring->actual_size,
						    GFP_KERNEL | __GFP_COLD)) {
				if (ring->actual_size < MLX4_EN_MIN_RX_SIZE) {
					en_err(priv, "Failed to allocate enough rx buffers\n");
					return -ENOMEM;
				} else {
					new_size = rounddown_pow_of_two(ring->actual_size);
					en_warn(priv, "Only %d buffers allocated reducing ring size to %d\n",
						ring->actual_size, new_size);
					goto reduce_rings;
				}
			}
			ring->actual_size++;
			ring->prod++;
		}
	}
	return 0;

reduce_rings:
	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
		ring = priv->rx_ring[ring_ind];
		while (ring->actual_size > new_size) {
			ring->actual_size--;
			ring->prod--;
			mlx4_en_free_rx_desc(priv, ring, ring->actual_size);
		}
	}

	return 0;
}
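
/* The ring is shrunk to a power of two on partial allocation because it is
 * indexed with 'prod & size_mask' / 'cons & size_mask'; see
 * mlx4_en_activate_rx_rings() below, where size_mask becomes
 * actual_size - 1.
 */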
static void mlx4_en_free_rx_buf(struct mlx4_en_priv *priv,
				struct mlx4_en_rx_ring *ring)
{
	int index;

	en_dbg(DRV, priv, "Freeing Rx buf - cons:%d prod:%d\n",
	       ring->cons, ring->prod);

	/* Unmap and free Rx buffers */
	BUG_ON((u32) (ring->prod - ring->cons) > ring->actual_size);
	while (ring->cons != ring->prod) {
		index = ring->cons & ring->size_mask;
		en_dbg(DRV, priv, "Processing descriptor:%d\n", index);
		mlx4_en_free_rx_desc(priv, ring, index);
		++ring->cons;
	}
}
void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev)
{
	int i;
	int num_of_eqs;
	int num_rx_rings;
	struct mlx4_dev *dev = mdev->dev;

	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
		if (!dev->caps.comp_pool)
			num_of_eqs = max_t(int, MIN_RX_RINGS,
					   min_t(int,
						 dev->caps.num_comp_vectors,
						 DEF_RX_RINGS));
		else
			num_of_eqs = min_t(int, MAX_MSIX_P_PORT,
					   dev->caps.comp_pool /
					   dev->caps.num_ports) - 1;

		num_rx_rings = mlx4_low_memory_profile() ? MIN_RX_RINGS :
			min_t(int, num_of_eqs,
			      netif_get_num_default_rss_queues());
		mdev->profile.prof[i].rx_ring_num =
			rounddown_pow_of_two(num_rx_rings);
	}
}
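
/* The ring count above is bounded by the available completion vectors (or
 * the per-port MSI-X budget) and rounded down to a power of two, which
 * matches the log2-encoded ring count the RSS context expects; see
 * mlx4_en_config_rss_steer() below.
 */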
int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
			   struct mlx4_en_rx_ring **pring,
			   u32 size, u16 stride, int node)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	struct mlx4_en_rx_ring *ring;
	int err = -ENOMEM;
	int tmp;

	ring = kzalloc_node(sizeof(*ring), GFP_KERNEL, node);
	if (!ring) {
		ring = kzalloc(sizeof(*ring), GFP_KERNEL);
		if (!ring) {
			en_err(priv, "Failed to allocate RX ring structure\n");
			return -ENOMEM;
		}
	}

	ring->prod = 0;
	ring->cons = 0;
	ring->size = size;
	ring->size_mask = size - 1;
	ring->stride = stride;
	ring->log_stride = ffs(ring->stride) - 1;
	ring->buf_size = ring->size * ring->stride + TXBB_SIZE;

	tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS *
					sizeof(struct mlx4_en_rx_alloc));
	ring->rx_info = vmalloc_node(tmp, node);
	if (!ring->rx_info) {
		ring->rx_info = vmalloc(tmp);
		if (!ring->rx_info) {
			err = -ENOMEM;
			goto err_ring;
		}
	}

	en_dbg(DRV, priv, "Allocated rx_info ring at addr:%p size:%d\n",
	       ring->rx_info, tmp);

	/* Allocate HW buffers on provided NUMA node */
	set_dev_node(&mdev->dev->persist->pdev->dev, node);
	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres,
				 ring->buf_size, 2 * PAGE_SIZE);
	set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node);
	if (err)
		goto err_info;

	err = mlx4_en_map_buffer(&ring->wqres.buf);
	if (err) {
		en_err(priv, "Failed to map RX buffer\n");
		goto err_hwq;
	}
	ring->buf = ring->wqres.buf.direct.buf;

	ring->hwtstamp_rx_filter = priv->hwtstamp_config.rx_filter;

	*pring = ring;
	return 0;

err_hwq:
	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
err_info:
	vfree(ring->rx_info);
	ring->rx_info = NULL;
err_ring:
	kfree(ring);
	*pring = NULL;
	return err;
}
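
/* The set_dev_node() dance above temporarily points the PCI device at the
 * requested NUMA node so mlx4_alloc_hwq_res() places the descriptor ring
 * near the CPU expected to process it, then restores the device's default
 * node for subsequent allocations.
 */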
int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv)
{
	struct mlx4_en_rx_ring *ring;
	int i;
	int ring_ind;
	int err;
	int stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) +
					DS_SIZE * priv->num_frags);

	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
		ring = priv->rx_ring[ring_ind];

		ring->prod = 0;
		ring->cons = 0;
		ring->actual_size = 0;
		ring->cqn = priv->rx_cq[ring_ind]->mcq.cqn;

		ring->stride = stride;
		if (ring->stride <= TXBB_SIZE)
			ring->buf += TXBB_SIZE;

		ring->log_stride = ffs(ring->stride) - 1;
		ring->buf_size = ring->size * ring->stride;

		memset(ring->buf, 0, ring->buf_size);
		mlx4_en_update_rx_prod_db(ring);

		/* Initialize all descriptors */
		for (i = 0; i < ring->size; i++)
			mlx4_en_init_rx_desc(priv, ring, i);

		/* Initialize page allocators */
		err = mlx4_en_init_allocator(priv, ring);
		if (err) {
			en_err(priv, "Failed initializing ring allocator\n");
			if (ring->stride <= TXBB_SIZE)
				ring->buf -= TXBB_SIZE;
			ring_ind--;
			goto err_allocator;
		}
	}
	err = mlx4_en_fill_rx_buffers(priv);
	if (err)
		goto err_buffers;

	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
		ring = priv->rx_ring[ring_ind];

		ring->size_mask = ring->actual_size - 1;
		mlx4_en_update_rx_prod_db(ring);
	}

	return 0;

err_buffers:
	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++)
		mlx4_en_free_rx_buf(priv, priv->rx_ring[ring_ind]);

	ring_ind = priv->rx_ring_num - 1;
err_allocator:
	while (ring_ind >= 0) {
		if (priv->rx_ring[ring_ind]->stride <= TXBB_SIZE)
			priv->rx_ring[ring_ind]->buf -= TXBB_SIZE;
		mlx4_en_destroy_allocator(priv, priv->rx_ring[ring_ind]);
		ring_ind--;
	}
	return err;
}
void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
			     struct mlx4_en_rx_ring **pring,
			     u32 size, u16 stride)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	struct mlx4_en_rx_ring *ring = *pring;

	mlx4_en_unmap_buffer(&ring->wqres.buf);
	mlx4_free_hwq_res(mdev->dev, &ring->wqres, size * stride + TXBB_SIZE);
	vfree(ring->rx_info);
	ring->rx_info = NULL;
	kfree(ring);
	*pring = NULL;
#ifdef CONFIG_RFS_ACCEL
	mlx4_en_cleanup_filters(priv);
#endif
}
void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
				struct mlx4_en_rx_ring *ring)
{
	mlx4_en_free_rx_buf(priv, ring);
	if (ring->stride <= TXBB_SIZE)
		ring->buf -= TXBB_SIZE;
	mlx4_en_destroy_allocator(priv, ring);
}
static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
				    struct mlx4_en_rx_desc *rx_desc,
				    struct mlx4_en_rx_alloc *frags,
				    struct sk_buff *skb,
				    int length)
{
	struct skb_frag_struct *skb_frags_rx = skb_shinfo(skb)->frags;
	struct mlx4_en_frag_info *frag_info;
	int nr;
	dma_addr_t dma;

	/* Collect used fragments while replacing them in the HW descriptors */
	for (nr = 0; nr < priv->num_frags; nr++) {
		frag_info = &priv->frag_info[nr];
		if (length <= frag_info->frag_prefix_size)
			break;
		if (!frags[nr].page)
			goto fail;

		dma = be64_to_cpu(rx_desc->data[nr].addr);
		dma_sync_single_for_cpu(priv->ddev, dma, frag_info->frag_size,
					DMA_FROM_DEVICE);

		/* Save page reference in skb */
		__skb_frag_set_page(&skb_frags_rx[nr], frags[nr].page);
		skb_frag_size_set(&skb_frags_rx[nr], frag_info->frag_size);
		skb_frags_rx[nr].page_offset = frags[nr].page_offset;
		skb->truesize += frag_info->frag_stride;
		frags[nr].page = NULL;
	}
	/* Adjust size of last fragment to match actual length */
	if (nr > 0)
		skb_frag_size_set(&skb_frags_rx[nr - 1],
			length - priv->frag_info[nr - 1].frag_prefix_size);
	return nr;

fail:
	while (nr > 0) {
		nr--;
		__skb_frag_unref(&skb_frags_rx[nr]);
	}
	return 0;
}
static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv,
				      struct mlx4_en_rx_desc *rx_desc,
				      struct mlx4_en_rx_alloc *frags,
				      unsigned int length)
{
	struct sk_buff *skb;
	void *va;
	int used_frags;
	dma_addr_t dma;

	skb = netdev_alloc_skb(priv->dev, SMALL_PACKET_SIZE + NET_IP_ALIGN);
	if (!skb) {
		en_dbg(RX_ERR, priv, "Failed allocating skb\n");
		return NULL;
	}
	skb_reserve(skb, NET_IP_ALIGN);
	skb->len = length;

	/* Get pointer to first fragment so we could copy the headers into the
	 * (linear part of the) skb */
	va = page_address(frags[0].page) + frags[0].page_offset;

	if (length <= SMALL_PACKET_SIZE) {
		/* We are copying all relevant data to the skb - temporarily
		 * sync buffers for the copy */
		dma = be64_to_cpu(rx_desc->data[0].addr);
		dma_sync_single_for_cpu(priv->ddev, dma, length,
					DMA_FROM_DEVICE);
		skb_copy_to_linear_data(skb, va, length);
		skb->tail += length;
	} else {
		unsigned int pull_len;

		/* Move relevant fragments to skb */
		used_frags = mlx4_en_complete_rx_desc(priv, rx_desc, frags,
						      skb, length);
		if (unlikely(!used_frags)) {
			kfree_skb(skb);
			return NULL;
		}
		skb_shinfo(skb)->nr_frags = used_frags;

		pull_len = eth_get_headlen(va, SMALL_PACKET_SIZE);
		/* Copy headers into the skb linear buffer */
		memcpy(skb->data, va, pull_len);
		skb->tail += pull_len;

		/* Skip headers in first fragment */
		skb_shinfo(skb)->frags[0].page_offset += pull_len;

		/* Adjust size of first fragment */
		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], pull_len);
		skb->data_len = length - pull_len;
	}
	return skb;
}
static void validate_loopback(struct mlx4_en_priv *priv, struct sk_buff *skb)
{
	int i;
	int offset = ETH_HLEN;

	for (i = 0; i < MLX4_LOOPBACK_TEST_PAYLOAD; i++, offset++) {
		if (*(skb->data + offset) != (unsigned char) (i & 0xff))
			goto out_loopback;
	}
	/* Loopback found */
	priv->loopback_ok = 1;

out_loopback:
	dev_kfree_skb_any(skb);
}
static void mlx4_en_refill_rx_buffers(struct mlx4_en_priv *priv,
				      struct mlx4_en_rx_ring *ring)
{
	int index = ring->prod & ring->size_mask;

	while ((u32) (ring->prod - ring->cons) < ring->actual_size) {
		if (mlx4_en_prepare_rx_desc(priv, ring, index,
					    GFP_ATOMIC | __GFP_COLD))
			break;
		ring->prod++;
		index = ring->prod & ring->size_mask;
	}
}
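
/* Refill runs from the NAPI poll path, hence GFP_ATOMIC here as opposed to
 * the GFP_KERNEL allocations used when the ring is first filled; if an
 * atomic allocation fails the loop simply stops and the next completion
 * pass tries again.
 */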
/* When hardware doesn't strip the vlan, we need to calculate the checksum
 * over it and add it to the hardware's checksum calculation
 */
static inline __wsum get_fixed_vlan_csum(__wsum hw_checksum,
					 struct vlan_hdr *vlanh)
{
	return csum_add(hw_checksum, *(__wsum *)vlanh);
}
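
/* The VLAN header is a single 32-bit word (TCI plus encapsulated protocol),
 * so folding the word at 'vlanh' into the running sum accounts for the tag
 * bytes that, per the comment above, the hardware checksum did not cover.
 */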
/* Although the stack expects checksum which doesn't include the pseudo
 * header, the HW adds it. To address that, we are subtracting the pseudo
 * header checksum from the checksum value provided by the HW.
 */
static void get_fixed_ipv4_csum(__wsum hw_checksum, struct sk_buff *skb,
				struct iphdr *iph)
{
	__u16 length_for_csum = 0;
	__wsum csum_pseudo_header = 0;

	length_for_csum = (be16_to_cpu(iph->tot_len) - (iph->ihl << 2));
	csum_pseudo_header = csum_tcpudp_nofold(iph->saddr, iph->daddr,
						length_for_csum, iph->protocol, 0);
	skb->csum = csum_sub(hw_checksum, csum_pseudo_header);
}
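
/* csum_tcpudp_nofold() rebuilds the pseudo-header sum (source and
 * destination addresses, protocol and L4 length) so it can be subtracted
 * from the hardware-provided value, exactly as the comment above describes.
 */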
#if IS_ENABLED(CONFIG_IPV6)
/* In IPv6 packets, besides subtracting the pseudo header checksum,
 * we also compute/add the IP header checksum which
 * is not added by the HW.
 */
static int get_fixed_ipv6_csum(__wsum hw_checksum, struct sk_buff *skb,
			       struct ipv6hdr *ipv6h)
{
	__wsum csum_pseudo_hdr = 0;

	if (ipv6h->nexthdr == IPPROTO_FRAGMENT || ipv6h->nexthdr == IPPROTO_HOPOPTS)
		return -1;
	hw_checksum = csum_add(hw_checksum, (__force __wsum)(ipv6h->nexthdr << 8));

	csum_pseudo_hdr = csum_partial(&ipv6h->saddr,
				       sizeof(ipv6h->saddr) + sizeof(ipv6h->daddr), 0);
	csum_pseudo_hdr = csum_add(csum_pseudo_hdr, (__force __wsum)ipv6h->payload_len);
	csum_pseudo_hdr = csum_add(csum_pseudo_hdr, (__force __wsum)ntohs(ipv6h->nexthdr));

	skb->csum = csum_sub(hw_checksum, csum_pseudo_hdr);
	skb->csum = csum_add(skb->csum, csum_partial(ipv6h, sizeof(struct ipv6hdr), 0));
	return 0;
}
#endif
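
/* Fragment and hop-by-hop extension headers are punted (-1 above): with
 * those present the payload-length / next-header fields no longer describe
 * the L4 segment directly, so the pseudo-header fixup would be wrong and
 * the caller falls back to software checksumming.
 */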
static int check_csum(struct mlx4_cqe *cqe, struct sk_buff *skb, void *va,
		      int hwtstamp_rx_filter)
{
	__wsum hw_checksum = 0;

	void *hdr = (u8 *)va + sizeof(struct ethhdr);

	hw_checksum = csum_unfold((__force __sum16)cqe->checksum);

	if (((struct ethhdr *)va)->h_proto == htons(ETH_P_8021Q) &&
	    hwtstamp_rx_filter != HWTSTAMP_FILTER_NONE) {
		/* next protocol non IPv4 or IPv6 */
		if (((struct vlan_hdr *)hdr)->h_vlan_encapsulated_proto
		    != htons(ETH_P_IP) &&
		    ((struct vlan_hdr *)hdr)->h_vlan_encapsulated_proto
		    != htons(ETH_P_IPV6))
			return -1;
		hw_checksum = get_fixed_vlan_csum(hw_checksum, hdr);
		hdr += sizeof(struct vlan_hdr);
	}

	if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV4))
		get_fixed_ipv4_csum(hw_checksum, skb, hdr);
#if IS_ENABLED(CONFIG_IPV6)
	else if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV6))
		if (get_fixed_ipv6_csum(hw_checksum, skb, hdr))
			return -1;
#endif
	return 0;
}
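
/* check_csum() returns non-zero when the hardware sum cannot be fixed up
 * (a VLAN frame whose inner protocol is neither IPv4 nor IPv6, or an IPv6
 * packet with fragment/hop-by-hop headers); callers below react by
 * downgrading ip_summed from CHECKSUM_COMPLETE to CHECKSUM_NONE.
 */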
int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	struct mlx4_en_dev *mdev = priv->mdev;
	struct mlx4_cqe *cqe;
	struct mlx4_en_rx_ring *ring = priv->rx_ring[cq->ring];
	struct mlx4_en_rx_alloc *frags;
	struct mlx4_en_rx_desc *rx_desc;
	struct sk_buff *skb;
	int index;
	int nr;
	unsigned int length;
	int polled = 0;
	int ip_summed;
	int factor = priv->cqe_factor;
	u64 timestamp;
	bool l2_tunnel;

	/* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
	 * descriptor offset can be deduced from the CQE index instead of
	 * reading 'cqe->index' */
	index = cq->mcq.cons_index & ring->size_mask;
	cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;

	/* Process all completed CQEs */
	while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
		    cq->mcq.cons_index & cq->size)) {

		frags = ring->rx_info + (index << priv->log_rx_info);
		rx_desc = ring->buf + (index << ring->log_stride);

		/*
		 * make sure we read the CQE after we read the ownership bit
		 */
		rmb();

		/* Drop packet on bad receive or bad checksum */
		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
						MLX4_CQE_OPCODE_ERROR)) {
			en_err(priv, "CQE completed in error - vendor syndrom:%d syndrom:%d\n",
			       ((struct mlx4_err_cqe *)cqe)->vendor_err_syndrome,
			       ((struct mlx4_err_cqe *)cqe)->syndrome);
			goto next;
		}
		if (unlikely(cqe->badfcs_enc & MLX4_CQE_BAD_FCS)) {
			en_dbg(RX_ERR, priv, "Accepted frame with bad FCS\n");
			goto next;
		}

		/* Check if we need to drop the packet if SRIOV is not enabled
		 * and not performing the selftest or flb disabled
		 */
		if (priv->flags & MLX4_EN_FLAG_RX_FILTER_NEEDED) {
			struct ethhdr *ethh;
			dma_addr_t dma;

			/* Get pointer to first fragment since we don't have
			 * an skb yet; cast it to an ethhdr struct
			 */
			dma = be64_to_cpu(rx_desc->data[0].addr);
			dma_sync_single_for_cpu(priv->ddev, dma, sizeof(*ethh),
						DMA_FROM_DEVICE);
			ethh = (struct ethhdr *)(page_address(frags[0].page) +
						 frags[0].page_offset);

			if (is_multicast_ether_addr(ethh->h_dest)) {
				struct mlx4_mac_entry *entry;
				struct hlist_head *bucket;
				unsigned int mac_hash;

				/* Drop the packet, since HW loopback-ed it */
				mac_hash = ethh->h_source[MLX4_EN_MAC_HASH_IDX];
				bucket = &priv->mac_hash[mac_hash];
				rcu_read_lock();
				hlist_for_each_entry_rcu(entry, bucket, hlist) {
					if (ether_addr_equal_64bits(entry->mac,
								    ethh->h_source)) {
						rcu_read_unlock();
						goto next;
					}
				}
				rcu_read_unlock();
			}
		}

		/*
		 * Packet is OK - process it.
		 */
		length = be32_to_cpu(cqe->byte_cnt);
		length -= ring->fcs_del;
		ring->bytes += length;
		ring->packets++;

		l2_tunnel = (dev->hw_enc_features & NETIF_F_RXCSUM) &&
			(cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_L2_TUNNEL));

		if (likely(dev->features & NETIF_F_RXCSUM)) {
			if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_TCP |
						      MLX4_CQE_STATUS_UDP)) {
				if ((cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) &&
				    cqe->checksum == cpu_to_be16(0xffff)) {
					ip_summed = CHECKSUM_UNNECESSARY;
					ring->csum_ok++;
				} else {
					ip_summed = CHECKSUM_NONE;
					ring->csum_none++;
				}
			} else {
				if (priv->flags & MLX4_EN_FLAG_RX_CSUM_NON_TCP_UDP &&
				    (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV4 |
							       MLX4_CQE_STATUS_IPV6))) {
					ip_summed = CHECKSUM_COMPLETE;
					ring->csum_complete++;
				} else {
					ip_summed = CHECKSUM_NONE;
					ring->csum_none++;
				}
			}
		} else {
			ip_summed = CHECKSUM_NONE;
			ring->csum_none++;
		}

		/* This packet is eligible for GRO if it is:
		 * - DIX Ethernet (type interpretation)
		 * - TCP/IP (v4)
		 * - without IP options
		 * - not an IP fragment
		 * - no LLS polling in progress
		 */
		if (!mlx4_en_cq_busy_polling(cq) &&
		    (dev->features & NETIF_F_GRO)) {
			struct sk_buff *gro_skb = napi_get_frags(&cq->napi);
			if (!gro_skb)
				goto next;

			nr = mlx4_en_complete_rx_desc(priv,
						      rx_desc, frags, gro_skb,
						      length);
			if (!nr)
				goto next;

			if (ip_summed == CHECKSUM_COMPLETE) {
				void *va = skb_frag_address(skb_shinfo(gro_skb)->frags);
				if (check_csum(cqe, gro_skb, va, ring->hwtstamp_rx_filter)) {
					ip_summed = CHECKSUM_NONE;
					ring->csum_none++;
					ring->csum_complete--;
				}
			}

			skb_shinfo(gro_skb)->nr_frags = nr;
			gro_skb->len = length;
			gro_skb->data_len = length;
			gro_skb->ip_summed = ip_summed;

			if (l2_tunnel && ip_summed == CHECKSUM_UNNECESSARY)
				gro_skb->csum_level = 1;

			if ((cqe->vlan_my_qpn &
			     cpu_to_be32(MLX4_CQE_VLAN_PRESENT_MASK)) &&
			    (dev->features & NETIF_F_HW_VLAN_CTAG_RX)) {
				u16 vid = be16_to_cpu(cqe->sl_vid);

				__vlan_hwaccel_put_tag(gro_skb, htons(ETH_P_8021Q), vid);
			}

			if (dev->features & NETIF_F_RXHASH)
				skb_set_hash(gro_skb,
					     be32_to_cpu(cqe->immed_rss_invalid),
					     PKT_HASH_TYPE_L3);

			skb_record_rx_queue(gro_skb, cq->ring);
			skb_mark_napi_id(gro_skb, &cq->napi);

			if (ring->hwtstamp_rx_filter == HWTSTAMP_FILTER_ALL) {
				timestamp = mlx4_en_get_cqe_ts(cqe);
				mlx4_en_fill_hwtstamps(mdev,
						       skb_hwtstamps(gro_skb),
						       timestamp);
			}

			napi_gro_frags(&cq->napi);
			goto next;
		}

		/* GRO not possible, complete processing here */
		skb = mlx4_en_rx_skb(priv, rx_desc, frags, length);
		if (!skb) {
			priv->stats.rx_dropped++;
			goto next;
		}

		if (unlikely(priv->validate_loopback)) {
			validate_loopback(priv, skb);
			goto next;
		}

		if (ip_summed == CHECKSUM_COMPLETE) {
			if (check_csum(cqe, skb, skb->data, ring->hwtstamp_rx_filter)) {
				ip_summed = CHECKSUM_NONE;
				ring->csum_complete--;
				ring->csum_none++;
			}
		}

		skb->ip_summed = ip_summed;
		skb->protocol = eth_type_trans(skb, dev);
		skb_record_rx_queue(skb, cq->ring);

		if (l2_tunnel && ip_summed == CHECKSUM_UNNECESSARY)
			skb->csum_level = 1;

		if (dev->features & NETIF_F_RXHASH)
			skb_set_hash(skb,
				     be32_to_cpu(cqe->immed_rss_invalid),
				     PKT_HASH_TYPE_L3);

		if ((be32_to_cpu(cqe->vlan_my_qpn) &
		     MLX4_CQE_VLAN_PRESENT_MASK) &&
		    (dev->features & NETIF_F_HW_VLAN_CTAG_RX))
			__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), be16_to_cpu(cqe->sl_vid));

		if (ring->hwtstamp_rx_filter == HWTSTAMP_FILTER_ALL) {
			timestamp = mlx4_en_get_cqe_ts(cqe);
			mlx4_en_fill_hwtstamps(mdev, skb_hwtstamps(skb),
					       timestamp);
		}

		skb_mark_napi_id(skb, &cq->napi);

		if (!mlx4_en_cq_busy_polling(cq))
			napi_gro_receive(&cq->napi, skb);
		else
			netif_receive_skb(skb);

next:
		for (nr = 0; nr < priv->num_frags; nr++)
			mlx4_en_free_frag(priv, frags, nr);

		++cq->mcq.cons_index;
		index = (cq->mcq.cons_index) & ring->size_mask;
		cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
		if (++polled == budget)
			goto out;
	}

out:
	AVG_PERF_COUNTER(priv->pstats.rx_coal_avg, polled);
	mlx4_cq_set_ci(&cq->mcq);
	wmb(); /* ensure HW sees CQ consumer before we post new buffers */
	ring->cons = cq->mcq.cons_index;
	mlx4_en_refill_rx_buffers(priv, ring);
	mlx4_en_update_rx_prod_db(ring);
	return polled;
}
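
/* Two ordering points in mlx4_en_process_rx_cq() above: a CQE is only
 * treated as valid when its owner bit matches the wrap parity of cons_index
 * (the XNOR test), and on exit the CQ consumer index is published with
 * mlx4_cq_set_ci() before the wmb() and the Rx producer doorbell, so the
 * hardware sees the consumed CQEs before any newly refilled buffers.
 */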
void mlx4_en_rx_irq(struct mlx4_cq *mcq)
{
	struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq);
	struct mlx4_en_priv *priv = netdev_priv(cq->dev);

	if (likely(priv->port_up))
		napi_schedule_irqoff(&cq->napi);
	else
		mlx4_en_arm_cq(priv, cq);
}
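
/* While the port is up the interrupt only schedules NAPI; when it is down
 * the handler skips NAPI entirely and simply re-arms the CQ.
 */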
/* Rx CQ polling - called by NAPI */
int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget)
{
	struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi);
	struct net_device *dev = cq->dev;
	struct mlx4_en_priv *priv = netdev_priv(dev);
	int done;

	if (!mlx4_en_cq_lock_napi(cq))
		return budget;

	done = mlx4_en_process_rx_cq(dev, cq, budget);

	mlx4_en_cq_unlock_napi(cq);

	/* If we used up all the quota - we're probably not done yet... */
	if (done == budget) {
		int cpu_curr;
		const struct cpumask *aff;

		INC_PERF_COUNTER(priv->pstats.napi_quota);

		cpu_curr = smp_processor_id();
		aff = irq_desc_get_irq_data(cq->irq_desc)->affinity;

		if (likely(cpumask_test_cpu(cpu_curr, aff)))
			return budget;

		/* Current cpu is not according to smp_irq_affinity -
		 * probably affinity changed. need to stop this NAPI
		 * poll, and restart it on the right CPU
		 */
		done = 0;
	}
	/* Done for now */
	napi_complete_done(napi, done);
	mlx4_en_arm_cq(priv, cq);
	return done;
}
[] = {
1054 void mlx4_en_calc_rx_buf(struct net_device
*dev
)
1056 struct mlx4_en_priv
*priv
= netdev_priv(dev
);
1057 int eff_mtu
= dev
->mtu
+ ETH_HLEN
+ VLAN_HLEN
;
1061 while (buf_size
< eff_mtu
) {
1062 priv
->frag_info
[i
].frag_size
=
1063 (eff_mtu
> buf_size
+ frag_sizes
[i
]) ?
1064 frag_sizes
[i
] : eff_mtu
- buf_size
;
1065 priv
->frag_info
[i
].frag_prefix_size
= buf_size
;
1066 priv
->frag_info
[i
].frag_stride
=
1067 ALIGN(priv
->frag_info
[i
].frag_size
,
1069 buf_size
+= priv
->frag_info
[i
].frag_size
;
1073 priv
->num_frags
= i
;
1074 priv
->rx_skb_size
= eff_mtu
;
1075 priv
->log_rx_info
= ROUNDUP_LOG2(i
* sizeof(struct mlx4_en_rx_alloc
));
1077 en_dbg(DRV
, priv
, "Rx buffer scatter-list (effective-mtu:%d num_frags:%d):\n",
1078 eff_mtu
, priv
->num_frags
);
1079 for (i
= 0; i
< priv
->num_frags
; i
++) {
1081 " frag:%d - size:%d prefix:%d stride:%d\n",
1083 priv
->frag_info
[i
].frag_size
,
1084 priv
->frag_info
[i
].frag_prefix_size
,
1085 priv
->frag_info
[i
].frag_stride
);
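
/* Sketch of the resulting layout: frames no larger than the first entry of
 * frag_sizes[] (typically the standard-MTU case) land entirely in fragment
 * 0, while larger (jumbo) MTUs are split across up to MLX4_EN_MAX_RX_FRAGS
 * fragments, each stride-aligned as computed above; frag_prefix_size
 * records how many bytes precede each fragment in the frame.
 */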
/* RSS related functions */

static int mlx4_en_config_rss_qp(struct mlx4_en_priv *priv, int qpn,
				 struct mlx4_en_rx_ring *ring,
				 enum mlx4_qp_state *state,
				 struct mlx4_qp *qp)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	struct mlx4_qp_context *context;
	int err = 0;

	context = kmalloc(sizeof(*context), GFP_KERNEL);
	if (!context)
		return -ENOMEM;

	err = mlx4_qp_alloc(mdev->dev, qpn, qp, GFP_KERNEL);
	if (err) {
		en_err(priv, "Failed to allocate qp #%x\n", qpn);
		goto out;
	}
	qp->event = mlx4_en_sqp_event;

	memset(context, 0, sizeof *context);
	mlx4_en_fill_qp_context(priv, ring->actual_size, ring->stride, 0, 0,
				qpn, ring->cqn, -1, context);
	context->db_rec_addr = cpu_to_be64(ring->wqres.db.dma);

	/* Cancel FCS removal if FW allows */
	if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP) {
		context->param3 |= cpu_to_be32(1 << 29);
		ring->fcs_del = ETH_FCS_LEN;
	} else
		ring->fcs_del = 0;

	err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, context, qp, state);
	if (err) {
		mlx4_qp_remove(mdev->dev, qp);
		mlx4_qp_free(mdev->dev, qp);
	}
	mlx4_en_update_rx_prod_db(ring);
out:
	kfree(context);
	return err;
}
int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv)
{
	int err;
	u32 qpn;

	err = mlx4_qp_reserve_range(priv->mdev->dev, 1, 1, &qpn,
				    MLX4_RESERVE_A0_QP);
	if (err) {
		en_err(priv, "Failed reserving drop qpn\n");
		return err;
	}
	err = mlx4_qp_alloc(priv->mdev->dev, qpn, &priv->drop_qp, GFP_KERNEL);
	if (err) {
		en_err(priv, "Failed allocating drop qp\n");
		mlx4_qp_release_range(priv->mdev->dev, qpn, 1);
		return err;
	}

	return 0;
}
void mlx4_en_destroy_drop_qp(struct mlx4_en_priv *priv)
{
	u32 qpn;

	qpn = priv->drop_qp.qpn;
	mlx4_qp_remove(priv->mdev->dev, &priv->drop_qp);
	mlx4_qp_free(priv->mdev->dev, &priv->drop_qp);
	mlx4_qp_release_range(priv->mdev->dev, qpn, 1);
}
/* Allocate rx qp's and configure them according to rss map */
int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	struct mlx4_en_rss_map *rss_map = &priv->rss_map;
	struct mlx4_qp_context context;
	struct mlx4_rss_context *rss_context;
	int rss_rings;
	void *ptr;
	u8 rss_mask = (MLX4_RSS_IPV4 | MLX4_RSS_TCP_IPV4 | MLX4_RSS_IPV6 |
			MLX4_RSS_TCP_IPV6);
	int i, qpn;
	int err = 0;
	int good_qps = 0;

	en_dbg(DRV, priv, "Configuring rss steering\n");
	err = mlx4_qp_reserve_range(mdev->dev, priv->rx_ring_num,
				    priv->rx_ring_num,
				    &rss_map->base_qpn, 0);
	if (err) {
		en_err(priv, "Failed reserving %d qps\n", priv->rx_ring_num);
		return err;
	}

	for (i = 0; i < priv->rx_ring_num; i++) {
		qpn = rss_map->base_qpn + i;
		err = mlx4_en_config_rss_qp(priv, qpn, priv->rx_ring[i],
					    &rss_map->state[i],
					    &rss_map->qps[i]);
		if (err)
			goto rss_err;

		++good_qps;
	}

	/* Configure RSS indirection qp */
	err = mlx4_qp_alloc(mdev->dev, priv->base_qpn, &rss_map->indir_qp, GFP_KERNEL);
	if (err) {
		en_err(priv, "Failed to allocate RSS indirection QP\n");
		goto rss_err;
	}
	rss_map->indir_qp.event = mlx4_en_sqp_event;
	mlx4_en_fill_qp_context(priv, 0, 0, 0, 1, priv->base_qpn,
				priv->rx_ring[0]->cqn, -1, &context);

	if (!priv->prof->rss_rings || priv->prof->rss_rings > priv->rx_ring_num)
		rss_rings = priv->rx_ring_num;
	else
		rss_rings = priv->prof->rss_rings;

	ptr = ((void *) &context) + offsetof(struct mlx4_qp_context, pri_path)
					+ MLX4_RSS_OFFSET_IN_QPC_PRI_PATH;
	rss_context = ptr;
	rss_context->base_qpn = cpu_to_be32(ilog2(rss_rings) << 24 |
					    (rss_map->base_qpn));
	rss_context->default_qpn = cpu_to_be32(rss_map->base_qpn);
	if (priv->mdev->profile.udp_rss) {
		rss_mask |= MLX4_RSS_UDP_IPV4 | MLX4_RSS_UDP_IPV6;
		rss_context->base_qpn_udp = rss_context->default_qpn;
	}

	if (mdev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
		en_info(priv, "Setting RSS context tunnel type to RSS on inner headers\n");
		rss_mask |= MLX4_RSS_BY_INNER_HEADERS;
	}

	rss_context->flags = rss_mask;
	rss_context->hash_fn = MLX4_RSS_HASH_TOP;
	if (priv->rss_hash_fn == ETH_RSS_HASH_XOR) {
		rss_context->hash_fn = MLX4_RSS_HASH_XOR;
	} else if (priv->rss_hash_fn == ETH_RSS_HASH_TOP) {
		rss_context->hash_fn = MLX4_RSS_HASH_TOP;
		memcpy(rss_context->rss_key, priv->rss_key,
		       MLX4_EN_RSS_KEY_SIZE);
		netdev_rss_key_fill(rss_context->rss_key,
				    MLX4_EN_RSS_KEY_SIZE);
	} else {
		en_err(priv, "Unknown RSS hash function requested\n");
		err = -EINVAL;
		goto indir_err;
	}

	err = mlx4_qp_to_ready(mdev->dev, &priv->res.mtt, &context,
			       &rss_map->indir_qp, &rss_map->indir_state);
	if (err)
		goto indir_err;

	return 0;

indir_err:
	mlx4_qp_modify(mdev->dev, NULL, rss_map->indir_state,
		       MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->indir_qp);
	mlx4_qp_remove(mdev->dev, &rss_map->indir_qp);
	mlx4_qp_free(mdev->dev, &rss_map->indir_qp);
rss_err:
	for (i = 0; i < good_qps; i++) {
		mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i],
			       MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->qps[i]);
		mlx4_qp_remove(mdev->dev, &rss_map->qps[i]);
		mlx4_qp_free(mdev->dev, &rss_map->qps[i]);
	}
	mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, priv->rx_ring_num);
	return err;
}
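
/* In mlx4_en_config_rss_steer() above, the top byte of the RSS context's
 * base_qpn carries log2(rss_rings): the hardware derives the indirection
 * span from a power-of-two ring count, which is why rss_rings (and
 * rx_ring_num, see mlx4_en_set_num_rx_rings()) are kept powers of two.
 */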
void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	struct mlx4_en_rss_map *rss_map = &priv->rss_map;
	int i;

	mlx4_qp_modify(mdev->dev, NULL, rss_map->indir_state,
		       MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->indir_qp);
	mlx4_qp_remove(mdev->dev, &rss_map->indir_qp);
	mlx4_qp_free(mdev->dev, &rss_map->indir_qp);

	for (i = 0; i < priv->rx_ring_num; i++) {
		mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i],
			       MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->qps[i]);
		mlx4_qp_remove(mdev->dev, &rss_map->qps[i]);
		mlx4_qp_free(mdev->dev, &rss_map->qps[i]);
	}
	mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, priv->rx_ring_num);
}