/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
/* Copyright (c) 2018 Mellanox Technologies. */
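
/* Headers used by the tunnel offload code below (assumed from the usual
 * en/tc_tun.c layout; adjust to the local tree if it differs).
 */
#include <net/vxlan.h>
#include <net/gre.h>
#include "lib/vxlan.h"
#include "en/tc_tun.h"

/* Resolve the routing device and the egress (out) device for an encap flow.
 * If the egress device sits on a different e-switch, or is the LAG master of
 * the uplink, traffic is steered through the uplink representor instead.
 */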
static int get_route_and_out_devs(struct mlx5e_priv *priv,
                                  struct net_device *dev,
                                  struct net_device **route_dev,
                                  struct net_device **out_dev)
{
        struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
        struct net_device *uplink_dev, *uplink_upper;
        bool dst_is_lag_dev;

        uplink_dev = mlx5_eswitch_uplink_get_proto_dev(esw, REP_ETH);
        uplink_upper = netdev_master_upper_dev_get(uplink_dev);
        dst_is_lag_dev = (uplink_upper &&
                          netif_is_lag_master(uplink_upper) &&
                          dev == uplink_upper &&
                          mlx5_lag_is_sriov(priv->mdev));

        /* if the egress device isn't on the same HW e-switch or
         * it's a LAG device, use the uplink
         */
        if (!switchdev_port_same_parent_id(priv->netdev, dev) ||
            dst_is_lag_dev) {
                *route_dev = uplink_dev;
                *out_dev = *route_dev;
        } else {
                *route_dev = dev;
                if (is_vlan_dev(*route_dev))
                        *out_dev = uplink_dev;
                else if (mlx5e_eswitch_rep(dev))
                        *out_dev = *route_dev;
                else
                        return -EOPNOTSUPP;
        }

        return 0;
}
static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv,
                                   struct net_device *mirred_dev,
                                   struct net_device **out_dev,
                                   struct net_device **route_dev,
                                   struct flowi4 *fl4,
                                   struct neighbour **out_n,
                                   u8 *out_ttl)
{
        struct rtable *rt;
        struct neighbour *n = NULL;

#if IS_ENABLED(CONFIG_INET)
        int ret;

        rt = ip_route_output_key(dev_net(mirred_dev), fl4);
        ret = PTR_ERR_OR_ZERO(rt);
        if (ret)
                return ret;
#else
        return -EOPNOTSUPP;
#endif

        ret = get_route_and_out_devs(priv, rt->dst.dev, route_dev, out_dev);
        if (ret < 0)
                return ret;

        if (!(*out_ttl))
                *out_ttl = ip4_dst_hoplimit(&rt->dst);
        n = dst_neigh_lookup(&rt->dst, &fl4->daddr);
        ip_rt_put(rt);
        if (!n)
                return -ENOMEM;

        *out_n = n;
        return 0;
}
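
/* rtnl link kind (e.g. "vxlan", "gretap") of a netdev, used in warnings. */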
static const char *mlx5e_netdev_kind(struct net_device *dev)
{
        if (dev->rtnl_link_ops)
                return dev->rtnl_link_ops->kind;
        else
                return "";
}
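
/* IPv6 counterpart of mlx5e_route_lookup_ipv4(), using the ipv6 stub for the
 * dst lookup so the code also builds when IPv6 is modular.
 */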
static int mlx5e_route_lookup_ipv6(struct mlx5e_priv *priv,
                                   struct net_device *mirred_dev,
                                   struct net_device **out_dev,
                                   struct net_device **route_dev,
                                   struct flowi6 *fl6,
                                   struct neighbour **out_n,
                                   u8 *out_ttl)
{
        struct neighbour *n = NULL;
        struct dst_entry *dst;

#if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6)
        int ret;

        ret = ipv6_stub->ipv6_dst_lookup(dev_net(mirred_dev), NULL, &dst,
                                         fl6);
        if (ret < 0)
                return ret;

        if (!(*out_ttl))
                *out_ttl = ip6_dst_hoplimit(dst);

        ret = get_route_and_out_devs(priv, dst->dev, route_dev, out_dev);
        if (ret < 0)
                return ret;
#else
        return -EOPNOTSUPP;
#endif

        n = dst_neigh_lookup(dst, &fl6->daddr);
        dst_release(dst);
        if (!n)
                return -ENOMEM;

        *out_n = n;
        return 0;
}
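
/* Write the UDP + VXLAN part of the encap header: destination port, VNI
 * flag and VNI. Remaining fields are left zeroed (the buffer comes from
 * kzalloc).
 */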
static int mlx5e_gen_vxlan_header(char buf[], struct ip_tunnel_key *tun_key)
{
        __be32 tun_id = tunnel_id_to_key32(tun_key->tun_id);
        struct udphdr *udp = (struct udphdr *)(buf);
        struct vxlanhdr *vxh = (struct vxlanhdr *)
                               ((char *)udp + sizeof(struct udphdr));

        udp->dest = tun_key->tp_dst;
        vxh->vx_flags = VXLAN_HF_VNI;
        vxh->vx_vni = vxlan_vni_field(tun_id);

        return 0;
}
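
/* Write the GRE part of the encap header. Checksum and sequence flags are
 * rejected because the HW does not compute them; an optional GRE key is
 * stored in the last 4 bytes of the header.
 */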
static int mlx5e_gen_gre_header(char buf[], struct ip_tunnel_key *tun_key)
{
        __be32 tun_id = tunnel_id_to_key32(tun_key->tun_id);
        int hdr_len;
        struct gre_base_hdr *greh = (struct gre_base_hdr *)(buf);

        /* the HW does not calculate GRE csum or sequences */
        if (tun_key->tun_flags & (TUNNEL_CSUM | TUNNEL_SEQ))
                return -EOPNOTSUPP;

        greh->protocol = htons(ETH_P_TEB);

        /* GRE key */
        hdr_len = gre_calc_hlen(tun_key->tun_flags);
        greh->flags = gre_tnl_flags_to_gre_flags(tun_key->tun_flags);
        if (tun_key->tun_flags & TUNNEL_KEY) {
                __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);

                *ptr = tun_id;
        }

        return 0;
}
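
/* Build the L4+ tunnel header for the encap entry and report the outer IP
 * protocol to use (UDP for VXLAN, GRE for gretap).
 */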
static int mlx5e_gen_ip_tunnel_header(char buf[], __u8 *ip_proto,
                                      struct mlx5e_encap_entry *e)
{
        int err = 0;
        struct ip_tunnel_key *key = &e->tun_info.key;

        if (e->tunnel_type == MLX5E_TC_TUNNEL_TYPE_VXLAN) {
                *ip_proto = IPPROTO_UDP;
                err = mlx5e_gen_vxlan_header(buf, key);
        } else if (e->tunnel_type == MLX5E_TC_TUNNEL_TYPE_GRETAP) {
                *ip_proto = IPPROTO_GRE;
                err = mlx5e_gen_gre_header(buf, key);
        } else {
                pr_warn("mlx5: Cannot generate tunnel header for tunnel type (%d)\n",
                        e->tunnel_type);
                err = -EOPNOTSUPP;
        }

        return err;
}
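
/* Write the outer Ethernet header (plus an 802.1Q tag when the route device
 * is a VLAN device) and return the position where the outer IP header starts.
 */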
static char *gen_eth_tnl_hdr(char *buf, struct net_device *dev,
                             struct mlx5e_encap_entry *e,
                             u16 proto)
{
        struct ethhdr *eth = (struct ethhdr *)buf;
        char *ip;

        ether_addr_copy(eth->h_dest, e->h_dest);
        ether_addr_copy(eth->h_source, dev->dev_addr);
        if (is_vlan_dev(dev)) {
                struct vlan_hdr *vlan = (struct vlan_hdr *)
                                        ((char *)eth + ETH_HLEN);
                ip = (char *)vlan + VLAN_HLEN;
                eth->h_proto = vlan_dev_vlan_proto(dev);
                vlan->h_vlan_TCI = htons(vlan_dev_vlan_id(dev));
                vlan->h_vlan_encapsulated_proto = htons(proto);
        } else {
                eth->h_proto = htons(proto);
                ip = (char *)eth + ETH_HLEN;
        }

        return ip;
}
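
/* Build the whole IPv4 encap header (ETH + IP + tunnel) for an encap entry,
 * attach the entry to the representor's neigh hash and, if the neighbour is
 * already valid, allocate the HW packet-reformat action.
 */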
int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv,
                                    struct net_device *mirred_dev,
                                    struct mlx5e_encap_entry *e)
{
        int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size);
        struct ip_tunnel_key *tun_key = &e->tun_info.key;
        struct net_device *out_dev, *route_dev;
        struct neighbour *n = NULL;
        struct flowi4 fl4 = {};
        int ipv4_encap_size;
        char *encap_header;
        u8 nud_state, ttl;
        struct iphdr *ip;
        int err;

        /* add the IP fields */
        fl4.flowi4_tos = tun_key->tos;
        fl4.daddr = tun_key->u.ipv4.dst;
        fl4.saddr = tun_key->u.ipv4.src;
        ttl = tun_key->ttl;

        err = mlx5e_route_lookup_ipv4(priv, mirred_dev, &out_dev, &route_dev,
                                      &fl4, &n, &ttl);
        if (err)
                return err;

        ipv4_encap_size =
                (is_vlan_dev(route_dev) ? VLAN_ETH_HLEN : ETH_HLEN) +
                sizeof(struct iphdr) +
                e->tunnel_hlen;

        if (max_encap_size < ipv4_encap_size) {
                mlx5_core_warn(priv->mdev, "encap size %d too big, max supported is %d\n",
                               ipv4_encap_size, max_encap_size);
                return -EOPNOTSUPP;
        }

        encap_header = kzalloc(ipv4_encap_size, GFP_KERNEL);
        if (!encap_header)
                return -ENOMEM;

        /* used by mlx5e_detach_encap to lookup a neigh hash table
         * entry in the neigh hash table when a user deletes a rule
         */
        e->m_neigh.dev = n->dev;
        e->m_neigh.family = n->ops->family;
        memcpy(&e->m_neigh.dst_ip, n->primary_key, n->tbl->key_len);
        e->out_dev = out_dev;

        /* It's important to add the neigh to the hash table before checking
         * the neigh validity state. So if we'll get a notification, in case the
         * neigh changes its validity state, we would find the relevant neigh
         * in the hash.
         */
        err = mlx5e_rep_encap_entry_attach(netdev_priv(out_dev), e);
        if (err)
                goto free_encap;

        read_lock_bh(&n->lock);
        nud_state = n->nud_state;
        ether_addr_copy(e->h_dest, n->ha);
        read_unlock_bh(&n->lock);

        /* add ethernet header */
        ip = (struct iphdr *)gen_eth_tnl_hdr(encap_header, route_dev, e,
                                             ETH_P_IP);

        /* add ip header */
        ip->tos = tun_key->tos;
        ip->version = 0x4;
        ip->ihl = 0x5;
        ip->ttl = ttl;
        ip->daddr = fl4.daddr;
        ip->saddr = fl4.saddr;

        /* add tunneling protocol header */
        err = mlx5e_gen_ip_tunnel_header((char *)ip + sizeof(struct iphdr),
                                         &ip->protocol, e);
        if (err)
                goto destroy_neigh_entry;

        e->encap_size = ipv4_encap_size;
        e->encap_header = encap_header;

        if (!(nud_state & NUD_VALID)) {
                neigh_event_send(n, NULL);
                err = -EAGAIN;
                goto out;
        }

        err = mlx5_packet_reformat_alloc(priv->mdev,
                                         e->reformat_type,
                                         ipv4_encap_size, encap_header,
                                         MLX5_FLOW_NAMESPACE_FDB,
                                         &e->encap_id);
        if (err)
                goto destroy_neigh_entry;

        e->flags |= MLX5_ENCAP_ENTRY_VALID;
        mlx5e_rep_queue_neigh_stats_work(netdev_priv(out_dev));
        neigh_release(n);
        return err;

destroy_neigh_entry:
        mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e);
free_encap:
        kfree(encap_header);
out:
        if (n)
                neigh_release(n);
        return err;
}
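
/* IPv6 variant of mlx5e_tc_tun_create_header_ipv4(): same flow with an
 * ipv6hdr as the outer IP header.
 */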
int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv,
                                    struct net_device *mirred_dev,
                                    struct mlx5e_encap_entry *e)
{
        int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size);
        struct ip_tunnel_key *tun_key = &e->tun_info.key;
        struct net_device *out_dev, *route_dev;
        struct neighbour *n = NULL;
        struct flowi6 fl6 = {};
        struct ipv6hdr *ip6h;
        int ipv6_encap_size;
        char *encap_header;
        u8 nud_state, ttl;
        int err;

        ttl = tun_key->ttl;

        fl6.flowlabel = ip6_make_flowinfo(RT_TOS(tun_key->tos), tun_key->label);
        fl6.daddr = tun_key->u.ipv6.dst;
        fl6.saddr = tun_key->u.ipv6.src;

        err = mlx5e_route_lookup_ipv6(priv, mirred_dev, &out_dev, &route_dev,
                                      &fl6, &n, &ttl);
        if (err)
                return err;

        ipv6_encap_size =
                (is_vlan_dev(route_dev) ? VLAN_ETH_HLEN : ETH_HLEN) +
                sizeof(struct ipv6hdr) +
                e->tunnel_hlen;

        if (max_encap_size < ipv6_encap_size) {
                mlx5_core_warn(priv->mdev, "encap size %d too big, max supported is %d\n",
                               ipv6_encap_size, max_encap_size);
                return -EOPNOTSUPP;
        }

        encap_header = kzalloc(ipv6_encap_size, GFP_KERNEL);
        if (!encap_header)
                return -ENOMEM;

        /* used by mlx5e_detach_encap to lookup a neigh hash table
         * entry in the neigh hash table when a user deletes a rule
         */
        e->m_neigh.dev = n->dev;
        e->m_neigh.family = n->ops->family;
        memcpy(&e->m_neigh.dst_ip, n->primary_key, n->tbl->key_len);
        e->out_dev = out_dev;

        /* It's important to add the neigh to the hash table before checking
         * the neigh validity state. So if we'll get a notification, in case the
         * neigh changes its validity state, we would find the relevant neigh
         * in the hash.
         */
        err = mlx5e_rep_encap_entry_attach(netdev_priv(out_dev), e);
        if (err)
                goto free_encap;

        read_lock_bh(&n->lock);
        nud_state = n->nud_state;
        ether_addr_copy(e->h_dest, n->ha);
        read_unlock_bh(&n->lock);

        /* add ethernet header */
        ip6h = (struct ipv6hdr *)gen_eth_tnl_hdr(encap_header, route_dev, e,
                                                 ETH_P_IPV6);

        /* add ip header */
        ip6_flow_hdr(ip6h, tun_key->tos, 0);
        /* the HW fills up ipv6 payload len */
        ip6h->hop_limit = ttl;
        ip6h->daddr = fl6.daddr;
        ip6h->saddr = fl6.saddr;

        /* add tunneling protocol header */
        err = mlx5e_gen_ip_tunnel_header((char *)ip6h + sizeof(struct ipv6hdr),
                                         &ip6h->nexthdr, e);
        if (err)
                goto destroy_neigh_entry;

        e->encap_size = ipv6_encap_size;
        e->encap_header = encap_header;

        if (!(nud_state & NUD_VALID)) {
                neigh_event_send(n, NULL);
                err = -EAGAIN;
                goto out;
        }

        err = mlx5_packet_reformat_alloc(priv->mdev,
                                         e->reformat_type,
                                         ipv6_encap_size, encap_header,
                                         MLX5_FLOW_NAMESPACE_FDB,
                                         &e->encap_id);
        if (err)
                goto destroy_neigh_entry;

        e->flags |= MLX5_ENCAP_ENTRY_VALID;
        mlx5e_rep_queue_neigh_stats_work(netdev_priv(out_dev));
        neigh_release(n);
        return err;

destroy_neigh_entry:
        mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e);
free_encap:
        kfree(encap_header);
out:
        if (n)
                neigh_release(n);
        return err;
}
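
/* Map a tunnel netdev to the tunnel type enum used by this file. */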
int mlx5e_tc_tun_get_type(struct net_device *tunnel_dev)
{
        if (netif_is_vxlan(tunnel_dev))
                return MLX5E_TC_TUNNEL_TYPE_VXLAN;
        else if (netif_is_gretap(tunnel_dev) ||
                 netif_is_ip6gretap(tunnel_dev))
                return MLX5E_TC_TUNNEL_TYPE_GRETAP;
        else
                return MLX5E_TC_TUNNEL_TYPE_UNKNOWN;
}
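
/* True if decap/encap for this tunnel netdev can be offloaded, based on the
 * e-switch encap/decap capabilities.
 */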
bool mlx5e_tc_tun_device_to_offload(struct mlx5e_priv *priv,
                                    struct net_device *netdev)
{
        int tunnel_type = mlx5e_tc_tun_get_type(netdev);

        if (tunnel_type == MLX5E_TC_TUNNEL_TYPE_VXLAN &&
            MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap))
                return true;
        else if (tunnel_type == MLX5E_TC_TUNNEL_TYPE_GRETAP &&
                 MLX5_CAP_ESW(priv->mdev, nvgre_encap_decap))
                return true;
        else
                return false;
}
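
/* Fill the reformat type and tunnel header length of an encap entry. For
 * VXLAN the UDP dport must already be registered with the HW.
 */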
int mlx5e_tc_tun_init_encap_attr(struct net_device *tunnel_dev,
                                 struct mlx5e_priv *priv,
                                 struct mlx5e_encap_entry *e,
                                 struct netlink_ext_ack *extack)
{
        e->tunnel_type = mlx5e_tc_tun_get_type(tunnel_dev);

        if (e->tunnel_type == MLX5E_TC_TUNNEL_TYPE_VXLAN) {
                int dst_port = be16_to_cpu(e->tun_info.key.tp_dst);

                if (!mlx5_vxlan_lookup_port(priv->mdev->vxlan, dst_port)) {
                        NL_SET_ERR_MSG_MOD(extack,
                                           "vxlan udp dport was not registered with the HW");
                        netdev_warn(priv->netdev,
                                    "%d isn't an offloaded vxlan udp dport\n",
                                    dst_port);
                        return -EOPNOTSUPP;
                }
                e->reformat_type = MLX5_REFORMAT_TYPE_L2_TO_VXLAN;
                e->tunnel_hlen = VXLAN_HLEN;
        } else if (e->tunnel_type == MLX5E_TC_TUNNEL_TYPE_GRETAP) {
                e->reformat_type = MLX5_REFORMAT_TYPE_L2_TO_NVGRE;
                e->tunnel_hlen = gre_calc_hlen(e->tun_info.key.tun_flags);
        } else {
                e->reformat_type = -1;
                e->tunnel_hlen = -1;
                return -EOPNOTSUPP;
        }
        return 0;
}
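
/* Translate a flower VXLAN decap match into the flow spec: outer UDP ports
 * (the dst port must be fully masked and known to the HW) and, optionally,
 * the VNI.
 */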
static int mlx5e_tc_tun_parse_vxlan(struct mlx5e_priv *priv,
                                    struct mlx5_flow_spec *spec,
                                    struct tc_cls_flower_offload *f,
                                    void *headers_c,
                                    void *headers_v)
{
        struct netlink_ext_ack *extack = f->common.extack;
        struct flow_dissector_key_ports *key =
                skb_flow_dissector_target(f->dissector,
                                          FLOW_DISSECTOR_KEY_ENC_PORTS,
                                          f->key);
        struct flow_dissector_key_ports *mask =
                skb_flow_dissector_target(f->dissector,
                                          FLOW_DISSECTOR_KEY_ENC_PORTS,
                                          f->mask);
        void *misc_c = MLX5_ADDR_OF(fte_match_param,
                                    spec->match_criteria,
                                    misc_parameters);
        void *misc_v = MLX5_ADDR_OF(fte_match_param,
                                    spec->match_value,
                                    misc_parameters);

        /* Full udp dst port must be given */
        if (!dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ENC_PORTS) ||
            memchr_inv(&mask->dst, 0xff, sizeof(mask->dst))) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "VXLAN decap filter must include enc_dst_port condition");
                netdev_warn(priv->netdev,
                            "VXLAN decap filter must include enc_dst_port condition\n");
                return -EOPNOTSUPP;
        }

        /* udp dst port must be known as a VXLAN port */
        if (!mlx5_vxlan_lookup_port(priv->mdev->vxlan, be16_to_cpu(key->dst))) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "Matched UDP port is not registered as a VXLAN port");
                netdev_warn(priv->netdev,
                            "UDP port %d is not registered as a VXLAN port\n",
                            be16_to_cpu(key->dst));
                return -EOPNOTSUPP;
        }

        /* dst UDP port is valid here */
        MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ip_protocol);
        MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_UDP);

        MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_dport, ntohs(mask->dst));
        MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, ntohs(key->dst));

        MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_sport, ntohs(mask->src));
        MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_sport, ntohs(key->src));

        /* match on VNI */
        if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) {
                struct flow_dissector_key_keyid *key =
                        skb_flow_dissector_target(f->dissector,
                                                  FLOW_DISSECTOR_KEY_ENC_KEYID,
                                                  f->key);
                struct flow_dissector_key_keyid *mask =
                        skb_flow_dissector_target(f->dissector,
                                                  FLOW_DISSECTOR_KEY_ENC_KEYID,
                                                  f->mask);
                MLX5_SET(fte_match_set_misc, misc_c, vxlan_vni,
                         be32_to_cpu(mask->keyid));
                MLX5_SET(fte_match_set_misc, misc_v, vxlan_vni,
                         be32_to_cpu(key->keyid));
        }

        return 0;
}
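
/* Translate a flower gretap decap match into the flow spec: GRE protocol
 * (TEB) and, optionally, the GRE key.
 */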
static int mlx5e_tc_tun_parse_gretap(struct mlx5e_priv *priv,
                                     struct mlx5_flow_spec *spec,
                                     struct tc_cls_flower_offload *f,
                                     void *outer_headers_c,
                                     void *outer_headers_v)
{
        void *misc_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
                                    misc_parameters);
        void *misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
                                    misc_parameters);

        if (!MLX5_CAP_ESW(priv->mdev, nvgre_encap_decap)) {
                NL_SET_ERR_MSG_MOD(f->common.extack,
                                   "GRE HW offloading is not supported");
                netdev_warn(priv->netdev, "GRE HW offloading is not supported\n");
                return -EOPNOTSUPP;
        }

        MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol);
        MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
                 ip_protocol, IPPROTO_GRE);

        /* gre protocol */
        MLX5_SET_TO_ONES(fte_match_set_misc, misc_c, gre_protocol);
        MLX5_SET(fte_match_set_misc, misc_v, gre_protocol, ETH_P_TEB);

        /* gre key */
        if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) {
                struct flow_dissector_key_keyid *mask = NULL;
                struct flow_dissector_key_keyid *key = NULL;

                mask = skb_flow_dissector_target(f->dissector,
                                                 FLOW_DISSECTOR_KEY_ENC_KEYID,
                                                 f->mask);
                MLX5_SET(fte_match_set_misc, misc_c,
                         gre_key.key, be32_to_cpu(mask->keyid));

                key = skb_flow_dissector_target(f->dissector,
                                                FLOW_DISSECTOR_KEY_ENC_KEYID,
                                                f->key);
                MLX5_SET(fte_match_set_misc, misc_v,
                         gre_key.key, be32_to_cpu(key->keyid));
        }

        return 0;
}
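
/* Dispatch tunnel match parsing according to the tunnel device type. */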
int mlx5e_tc_tun_parse(struct net_device *filter_dev,
                       struct mlx5e_priv *priv,
                       struct mlx5_flow_spec *spec,
                       struct tc_cls_flower_offload *f,
                       void *headers_c,
                       void *headers_v)
{
        int tunnel_type;
        int err = 0;

        tunnel_type = mlx5e_tc_tun_get_type(filter_dev);
        if (tunnel_type == MLX5E_TC_TUNNEL_TYPE_VXLAN) {
                err = mlx5e_tc_tun_parse_vxlan(priv, spec, f,
                                               headers_c, headers_v);
        } else if (tunnel_type == MLX5E_TC_TUNNEL_TYPE_GRETAP) {
                err = mlx5e_tc_tun_parse_gretap(priv, spec, f,
                                                headers_c, headers_v);
        } else {
                netdev_warn(priv->netdev,
                            "decapsulation offload is not supported for %s net device (%d)\n",
                            mlx5e_netdev_kind(filter_dev), tunnel_type);
                return -EOPNOTSUPP;
        }

        return err;
}