ceph/src/seastar/dpdk/drivers/net/mlx5/mlx5_flow_tcf.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2018 6WIND S.A.
3 * Copyright 2018 Mellanox Technologies, Ltd
4 */
5
6 #include <assert.h>
7 #include <errno.h>
8 #include <libmnl/libmnl.h>
9 #include <linux/gen_stats.h>
10 #include <linux/if_ether.h>
11 #include <linux/netlink.h>
12 #include <linux/pkt_cls.h>
13 #include <linux/pkt_sched.h>
14 #include <linux/rtnetlink.h>
15 #include <linux/tc_act/tc_gact.h>
16 #include <linux/tc_act/tc_mirred.h>
17 #include <netinet/in.h>
18 #include <stdalign.h>
19 #include <stdbool.h>
20 #include <stddef.h>
21 #include <stdint.h>
22 #include <stdlib.h>
23 #include <sys/socket.h>
24
25 #include <rte_byteorder.h>
26 #include <rte_errno.h>
27 #include <rte_ether.h>
28 #include <rte_flow.h>
29 #include <rte_malloc.h>
30 #include <rte_common.h>
31 #include <rte_cycles.h>
32
33 #include "mlx5.h"
34 #include "mlx5_flow.h"
35 #include "mlx5_autoconf.h"
36
37 #ifdef HAVE_TC_ACT_VLAN
38
39 #include <linux/tc_act/tc_vlan.h>
40
41 #else /* HAVE_TC_ACT_VLAN */
42
43 #define TCA_VLAN_ACT_POP 1
44 #define TCA_VLAN_ACT_PUSH 2
45 #define TCA_VLAN_ACT_MODIFY 3
46 #define TCA_VLAN_PARMS 2
47 #define TCA_VLAN_PUSH_VLAN_ID 3
48 #define TCA_VLAN_PUSH_VLAN_PROTOCOL 4
49 #define TCA_VLAN_PAD 5
50 #define TCA_VLAN_PUSH_VLAN_PRIORITY 6
51
52 struct tc_vlan {
53 tc_gen;
54 int v_action;
55 };
56
57 #endif /* HAVE_TC_ACT_VLAN */
58
59 #ifdef HAVE_TC_ACT_PEDIT
60
61 #include <linux/tc_act/tc_pedit.h>
62
63 #else /* HAVE_TC_ACT_PEDIT */
64
65 enum {
66 TCA_PEDIT_UNSPEC,
67 TCA_PEDIT_TM,
68 TCA_PEDIT_PARMS,
69 TCA_PEDIT_PAD,
70 TCA_PEDIT_PARMS_EX,
71 TCA_PEDIT_KEYS_EX,
72 TCA_PEDIT_KEY_EX,
73 __TCA_PEDIT_MAX
74 };
75
76 enum {
77 TCA_PEDIT_KEY_EX_HTYPE = 1,
78 TCA_PEDIT_KEY_EX_CMD = 2,
79 __TCA_PEDIT_KEY_EX_MAX
80 };
81
82 enum pedit_header_type {
83 TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK = 0,
84 TCA_PEDIT_KEY_EX_HDR_TYPE_ETH = 1,
85 TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 = 2,
86 TCA_PEDIT_KEY_EX_HDR_TYPE_IP6 = 3,
87 TCA_PEDIT_KEY_EX_HDR_TYPE_TCP = 4,
88 TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5,
89 __PEDIT_HDR_TYPE_MAX,
90 };
91
92 enum pedit_cmd {
93 TCA_PEDIT_KEY_EX_CMD_SET = 0,
94 TCA_PEDIT_KEY_EX_CMD_ADD = 1,
95 __PEDIT_CMD_MAX,
96 };
97
98 struct tc_pedit_key {
99 __u32 mask; /* AND */
100 __u32 val; /* XOR */
101 __u32 off; /* offset */
102 __u32 at;
103 __u32 offmask;
104 __u32 shift;
105 };
106
107 __extension__
108 struct tc_pedit_sel {
109 tc_gen;
110 unsigned char nkeys;
111 unsigned char flags;
112 struct tc_pedit_key keys[0];
113 };
114
115 #endif /* HAVE_TC_ACT_PEDIT */
116
117 #ifdef HAVE_TC_ACT_TUNNEL_KEY
118
119 #include <linux/tc_act/tc_tunnel_key.h>
120
121 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
122 #define TCA_TUNNEL_KEY_ENC_DST_PORT 9
123 #endif
124
125 #ifndef HAVE_TCA_TUNNEL_KEY_NO_CSUM
126 #define TCA_TUNNEL_KEY_NO_CSUM 10
127 #endif
128
129 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_TOS
130 #define TCA_TUNNEL_KEY_ENC_TOS 12
131 #endif
132
133 #ifndef HAVE_TCA_TUNNEL_KEY_ENC_TTL
134 #define TCA_TUNNEL_KEY_ENC_TTL 13
135 #endif
136
137 #else /* HAVE_TC_ACT_TUNNEL_KEY */
138
139 #define TCA_ACT_TUNNEL_KEY 17
140 #define TCA_TUNNEL_KEY_ACT_SET 1
141 #define TCA_TUNNEL_KEY_ACT_RELEASE 2
142 #define TCA_TUNNEL_KEY_PARMS 2
143 #define TCA_TUNNEL_KEY_ENC_IPV4_SRC 3
144 #define TCA_TUNNEL_KEY_ENC_IPV4_DST 4
145 #define TCA_TUNNEL_KEY_ENC_IPV6_SRC 5
146 #define TCA_TUNNEL_KEY_ENC_IPV6_DST 6
147 #define TCA_TUNNEL_KEY_ENC_KEY_ID 7
148 #define TCA_TUNNEL_KEY_ENC_DST_PORT 9
149 #define TCA_TUNNEL_KEY_NO_CSUM 10
150 #define TCA_TUNNEL_KEY_ENC_TOS 12
151 #define TCA_TUNNEL_KEY_ENC_TTL 13
152
153 struct tc_tunnel_key {
154 tc_gen;
155 int t_action;
156 };
157
158 #endif /* HAVE_TC_ACT_TUNNEL_KEY */
159
160 /* Normally found in linux/netlink.h. */
161 #ifndef NETLINK_CAP_ACK
162 #define NETLINK_CAP_ACK 10
163 #endif
164
165 /* Normally found in linux/pkt_sched.h. */
166 #ifndef TC_H_MIN_INGRESS
167 #define TC_H_MIN_INGRESS 0xfff2u
168 #endif
169
170 /* Normally found in linux/pkt_cls.h. */
171 #ifndef TCA_CLS_FLAGS_SKIP_SW
172 #define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
173 #endif
174 #ifndef TCA_CLS_FLAGS_IN_HW
175 #define TCA_CLS_FLAGS_IN_HW (1 << 2)
176 #endif
177 #ifndef HAVE_TCA_CHAIN
178 #define TCA_CHAIN 11
179 #endif
180 #ifndef HAVE_TCA_FLOWER_ACT
181 #define TCA_FLOWER_ACT 3
182 #endif
183 #ifndef HAVE_TCA_FLOWER_FLAGS
184 #define TCA_FLOWER_FLAGS 22
185 #endif
186 #ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
187 #define TCA_FLOWER_KEY_ETH_TYPE 8
188 #endif
189 #ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
190 #define TCA_FLOWER_KEY_ETH_DST 4
191 #endif
192 #ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
193 #define TCA_FLOWER_KEY_ETH_DST_MASK 5
194 #endif
195 #ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
196 #define TCA_FLOWER_KEY_ETH_SRC 6
197 #endif
198 #ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
199 #define TCA_FLOWER_KEY_ETH_SRC_MASK 7
200 #endif
201 #ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
202 #define TCA_FLOWER_KEY_IP_PROTO 9
203 #endif
204 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
205 #define TCA_FLOWER_KEY_IPV4_SRC 10
206 #endif
207 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
208 #define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
209 #endif
210 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
211 #define TCA_FLOWER_KEY_IPV4_DST 12
212 #endif
213 #ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
214 #define TCA_FLOWER_KEY_IPV4_DST_MASK 13
215 #endif
216 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
217 #define TCA_FLOWER_KEY_IPV6_SRC 14
218 #endif
219 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
220 #define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
221 #endif
222 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
223 #define TCA_FLOWER_KEY_IPV6_DST 16
224 #endif
225 #ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
226 #define TCA_FLOWER_KEY_IPV6_DST_MASK 17
227 #endif
228 #ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
229 #define TCA_FLOWER_KEY_TCP_SRC 18
230 #endif
231 #ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
232 #define TCA_FLOWER_KEY_TCP_SRC_MASK 35
233 #endif
234 #ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
235 #define TCA_FLOWER_KEY_TCP_DST 19
236 #endif
237 #ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
238 #define TCA_FLOWER_KEY_TCP_DST_MASK 36
239 #endif
240 #ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
241 #define TCA_FLOWER_KEY_UDP_SRC 20
242 #endif
243 #ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
244 #define TCA_FLOWER_KEY_UDP_SRC_MASK 37
245 #endif
246 #ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
247 #define TCA_FLOWER_KEY_UDP_DST 21
248 #endif
249 #ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
250 #define TCA_FLOWER_KEY_UDP_DST_MASK 38
251 #endif
252 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID
253 #define TCA_FLOWER_KEY_VLAN_ID 23
254 #endif
255 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO
256 #define TCA_FLOWER_KEY_VLAN_PRIO 24
257 #endif
258 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
259 #define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
260 #endif
261 #ifndef HAVE_TCA_FLOWER_KEY_ENC_KEY_ID
262 #define TCA_FLOWER_KEY_ENC_KEY_ID 26
263 #endif
264 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC
265 #define TCA_FLOWER_KEY_ENC_IPV4_SRC 27
266 #endif
267 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK
268 #define TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK 28
269 #endif
270 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST
271 #define TCA_FLOWER_KEY_ENC_IPV4_DST 29
272 #endif
273 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK
274 #define TCA_FLOWER_KEY_ENC_IPV4_DST_MASK 30
275 #endif
276 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC
277 #define TCA_FLOWER_KEY_ENC_IPV6_SRC 31
278 #endif
279 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK
280 #define TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK 32
281 #endif
282 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST
283 #define TCA_FLOWER_KEY_ENC_IPV6_DST 33
284 #endif
285 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK
286 #define TCA_FLOWER_KEY_ENC_IPV6_DST_MASK 34
287 #endif
288 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT
289 #define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT 43
290 #endif
291 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK
292 #define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK 44
293 #endif
294 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT
295 #define TCA_FLOWER_KEY_ENC_UDP_DST_PORT 45
296 #endif
297 #ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK
298 #define TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK 46
299 #endif
300 #ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS
301 #define TCA_FLOWER_KEY_TCP_FLAGS 71
302 #endif
303 #ifndef HAVE_TCA_FLOWER_KEY_TCP_FLAGS_MASK
304 #define TCA_FLOWER_KEY_TCP_FLAGS_MASK 72
305 #endif
306 #ifndef HAVE_TCA_FLOWER_KEY_IP_TOS
307 #define TCA_FLOWER_KEY_IP_TOS 73
308 #endif
309 #ifndef HAVE_TCA_FLOWER_KEY_IP_TOS_MASK
310 #define TCA_FLOWER_KEY_IP_TOS_MASK 74
311 #endif
312 #ifndef HAVE_TCA_FLOWER_KEY_IP_TTL
313 #define TCA_FLOWER_KEY_IP_TTL 75
314 #endif
315 #ifndef HAVE_TCA_FLOWER_KEY_IP_TTL_MASK
316 #define TCA_FLOWER_KEY_IP_TTL_MASK 76
317 #endif
318 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TOS
319 #define TCA_FLOWER_KEY_ENC_IP_TOS 80
320 #endif
321 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TOS_MASK
322 #define TCA_FLOWER_KEY_ENC_IP_TOS_MASK 81
323 #endif
324 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TTL
325 #define TCA_FLOWER_KEY_ENC_IP_TTL 82
326 #endif
327 #ifndef HAVE_TCA_FLOWER_KEY_ENC_IP_TTL_MASK
328 #define TCA_FLOWER_KEY_ENC_IP_TTL_MASK 83
329 #endif
330
331 #ifndef HAVE_TC_ACT_GOTO_CHAIN
332 #define TC_ACT_GOTO_CHAIN 0x20000000
333 #endif
334
335 #ifndef IPV6_ADDR_LEN
336 #define IPV6_ADDR_LEN 16
337 #endif
338
339 #ifndef IPV4_ADDR_LEN
340 #define IPV4_ADDR_LEN 4
341 #endif
342
343 #ifndef TP_PORT_LEN
344 #define TP_PORT_LEN 2 /* Transport Port (UDP/TCP) Length */
345 #endif
346
347 #ifndef TTL_LEN
348 #define TTL_LEN 1
349 #endif
350
351 #ifndef TCA_ACT_MAX_PRIO
352 #define TCA_ACT_MAX_PRIO 32
353 #endif
354
355 /** Parameters of VXLAN devices created by driver. */
356 #define MLX5_VXLAN_DEFAULT_VNI 1
357 #define MLX5_VXLAN_DEVICE_PFX "vmlx_"
358 /**
359 * Timeout in milliseconds to wait for the VXLAN UDP offloaded port
360 * registration to complete within the mlx5 driver.
361 */
362 #define MLX5_VXLAN_WAIT_PORT_REG_MS 250
363
364 /** Tunnel action type, used for @p type in header structure. */
365 enum flow_tcf_tunact_type {
366 FLOW_TCF_TUNACT_VXLAN_DECAP,
367 FLOW_TCF_TUNACT_VXLAN_ENCAP,
368 };
369
370 /** Flags used for @p mask in tunnel action encap descriptors. */
371 #define FLOW_TCF_ENCAP_ETH_SRC (1u << 0)
372 #define FLOW_TCF_ENCAP_ETH_DST (1u << 1)
373 #define FLOW_TCF_ENCAP_IPV4_SRC (1u << 2)
374 #define FLOW_TCF_ENCAP_IPV4_DST (1u << 3)
375 #define FLOW_TCF_ENCAP_IPV6_SRC (1u << 4)
376 #define FLOW_TCF_ENCAP_IPV6_DST (1u << 5)
377 #define FLOW_TCF_ENCAP_UDP_SRC (1u << 6)
378 #define FLOW_TCF_ENCAP_UDP_DST (1u << 7)
379 #define FLOW_TCF_ENCAP_VXLAN_VNI (1u << 8)
380 #define FLOW_TCF_ENCAP_IP_TTL (1u << 9)
381 #define FLOW_TCF_ENCAP_IP_TOS (1u << 10)
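/*
 * Note: these bits are accumulated into the "mask" field of
 * struct flow_tcf_vxlan_encap below to record which encapsulation
 * header fields were explicitly provided by the application.
 */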
382
383 /**
384 * Structure for holding netlink context.
385 * Note the size of the message buffer is MNL_SOCKET_BUFFER_SIZE (8KB),
386 * which ensures that netlink messages will never be truncated.
388 */
389 struct mlx5_flow_tcf_context {
390 struct mnl_socket *nl; /* NETLINK_ROUTE libmnl socket. */
391 uint32_t seq; /* Message sequence number. */
392 uint32_t buf_size; /* Message buffer size. */
393 uint8_t *buf; /* Message buffer. */
394 };
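/*
 * A minimal usage sketch (hypothetical local names): requests are built
 * directly in ctx->buf and stamped with a fresh sequence number, e.g.:
 *
 *   struct nlmsghdr *nlh = mnl_nlmsg_put_header(ctx->buf);
 *   nlh->nlmsg_type = RTM_NEWTFILTER;
 *   nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
 *   nlh->nlmsg_seq = ctx->seq++;
 */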
395
396 /**
397 * Neigh rule structure. The neigh rule is applied via Netlink to
398 * outer tunnel iface in order to provide destination MAC address
399 * for the VXLAN encapsulation. The neigh rule is implicitly related
400 * to the Flow itself and can be shared by multiple Flows.
401 */
402 struct tcf_neigh_rule {
403 LIST_ENTRY(tcf_neigh_rule) next;
404 uint32_t refcnt;
405 struct ether_addr eth;
406 uint16_t mask;
407 union {
408 struct {
409 rte_be32_t dst;
410 } ipv4;
411 struct {
412 uint8_t dst[IPV6_ADDR_LEN];
413 } ipv6;
414 };
415 };
416
417 /**
418 * Local rule structure. The local rule is applied via Netlink to
419 * outer tunnel iface in order to provide local and peer IP addresses
420 * of the VXLAN tunnel for encapsulation. The local rule is implicitly
421 * related to the Flow itself and can be shared by multiple Flows.
422 */
423 struct tcf_local_rule {
424 LIST_ENTRY(tcf_local_rule) next;
425 uint32_t refcnt;
426 uint16_t mask;
427 union {
428 struct {
429 rte_be32_t dst;
430 rte_be32_t src;
431 } ipv4;
432 struct {
433 uint8_t dst[IPV6_ADDR_LEN];
434 uint8_t src[IPV6_ADDR_LEN];
435 } ipv6;
436 };
437 };
438
439 /** Outer interface VXLAN encapsulation rules container. */
440 struct tcf_irule {
441 LIST_ENTRY(tcf_irule) next;
442 LIST_HEAD(, tcf_neigh_rule) neigh;
443 LIST_HEAD(, tcf_local_rule) local;
444 uint32_t refcnt;
445 unsigned int ifouter; /**< Own interface index. */
446 };
447
448 /** VXLAN virtual netdev. */
449 struct tcf_vtep {
450 LIST_ENTRY(tcf_vtep) next;
451 uint32_t refcnt;
452 unsigned int ifindex; /**< Own interface index. */
453 uint16_t port;
454 uint32_t created:1; /**< Actually created by PMD. */
455 uint32_t waitreg:1; /**< Wait for VXLAN UDP port registration. */
456 };
457
458 /** Tunnel descriptor header, common for all tunnel types. */
459 struct flow_tcf_tunnel_hdr {
460 uint32_t type; /**< Tunnel action type. */
461 struct tcf_vtep *vtep; /**< Virtual tunnel endpoint device. */
462 unsigned int ifindex_org; /**< Original dst/src interface */
463 unsigned int *ifindex_ptr; /**< Interface ptr in message. */
464 };
465
466 struct flow_tcf_vxlan_decap {
467 struct flow_tcf_tunnel_hdr hdr;
468 uint16_t udp_port;
469 };
470
471 struct flow_tcf_vxlan_encap {
472 struct flow_tcf_tunnel_hdr hdr;
473 struct tcf_irule *iface;
474 uint32_t mask;
475 uint8_t ip_tos;
476 uint8_t ip_ttl_hop;
477 struct {
478 struct ether_addr dst;
479 struct ether_addr src;
480 } eth;
481 union {
482 struct {
483 rte_be32_t dst;
484 rte_be32_t src;
485 } ipv4;
486 struct {
487 uint8_t dst[IPV6_ADDR_LEN];
488 uint8_t src[IPV6_ADDR_LEN];
489 } ipv6;
490 };
491 struct {
492 rte_be16_t src;
493 rte_be16_t dst;
494 } udp;
495 struct {
496 uint8_t vni[3];
497 } vxlan;
498 };
499
500 /** Structure used when extracting the values of a flow counter
501 * from a netlink message.
502 */
503 struct flow_tcf_stats_basic {
504 bool valid;
505 struct gnet_stats_basic counters;
506 };
507
508 /** Empty masks for known item types. */
509 static const union {
510 struct rte_flow_item_port_id port_id;
511 struct rte_flow_item_eth eth;
512 struct rte_flow_item_vlan vlan;
513 struct rte_flow_item_ipv4 ipv4;
514 struct rte_flow_item_ipv6 ipv6;
515 struct rte_flow_item_tcp tcp;
516 struct rte_flow_item_udp udp;
517 struct rte_flow_item_vxlan vxlan;
518 } flow_tcf_mask_empty = {
519 {0},
520 };
521
522 /** Supported masks for known item types. */
523 static const struct {
524 struct rte_flow_item_port_id port_id;
525 struct rte_flow_item_eth eth;
526 struct rte_flow_item_vlan vlan;
527 struct rte_flow_item_ipv4 ipv4;
528 struct rte_flow_item_ipv6 ipv6;
529 struct rte_flow_item_tcp tcp;
530 struct rte_flow_item_udp udp;
531 struct rte_flow_item_vxlan vxlan;
532 } flow_tcf_mask_supported = {
533 .port_id = {
534 .id = 0xffffffff,
535 },
536 .eth = {
537 .type = RTE_BE16(0xffff),
538 .dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
539 .src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
540 },
541 .vlan = {
542 /* PCP and VID only, no DEI. */
543 .tci = RTE_BE16(0xefff),
544 .inner_type = RTE_BE16(0xffff),
545 },
546 .ipv4.hdr = {
547 .next_proto_id = 0xff,
548 .time_to_live = 0xff,
549 .type_of_service = 0xff,
550 .src_addr = RTE_BE32(0xffffffff),
551 .dst_addr = RTE_BE32(0xffffffff),
552 },
553 .ipv6.hdr = {
554 .proto = 0xff,
555 .vtc_flow = RTE_BE32(0xfful << IPV6_HDR_FL_SHIFT),
556 .hop_limits = 0xff,
557 .src_addr =
558 "\xff\xff\xff\xff\xff\xff\xff\xff"
559 "\xff\xff\xff\xff\xff\xff\xff\xff",
560 .dst_addr =
561 "\xff\xff\xff\xff\xff\xff\xff\xff"
562 "\xff\xff\xff\xff\xff\xff\xff\xff",
563 },
564 .tcp.hdr = {
565 .src_port = RTE_BE16(0xffff),
566 .dst_port = RTE_BE16(0xffff),
567 .tcp_flags = 0xff,
568 },
569 .udp.hdr = {
570 .src_port = RTE_BE16(0xffff),
571 .dst_port = RTE_BE16(0xffff),
572 },
573 .vxlan = {
574 .vni = "\xff\xff\xff",
575 },
576 };
577
578 #define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr))
579 #define SZ_NLATTR_NEST SZ_NLATTR_HDR
580 #define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len))
581 #define SZ_NLATTR_TYPE_OF(typ) SZ_NLATTR_DATA_OF(sizeof(typ))
582 #define SZ_NLATTR_STRZ_OF(str) SZ_NLATTR_DATA_OF(strlen(str) + 1)
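/*
 * Worked example (assuming the usual 4-byte nlattr header and 4-byte
 * libmnl alignment): SZ_NLATTR_TYPE_OF(uint16_t) = MNL_ALIGN(4 + 2) = 8
 * and SZ_NLATTR_STRZ_OF("pedit") = MNL_ALIGN(4 + 6) = 12. These helpers
 * are only used to conservatively pre-size netlink message buffers.
 */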
583
584 #define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2)
585
586 /** DPDK port to network interface index (ifindex) conversion. */
587 struct flow_tcf_ptoi {
588 uint16_t port_id; /**< DPDK port ID. */
589 unsigned int ifindex; /**< Network interface index. */
590 };
591
592 /* Due to a limitation on driver/FW. */
593 #define MLX5_TCF_GROUP_ID_MAX 3
594
595 /*
596 * Due to a limitation on driver/FW, priority ranges from 1 to 16 in kernel.
597 * Priority in rte_flow attribute starts from 0 and is incremented by 1 in
598 * translation. This is subject to be changed to determine the max priority
599 * based on trial-and-error like Verbs driver once the restriction is lifted or
600 * the range is extended.
601 */
602 #define MLX5_TCF_GROUP_PRIORITY_MAX 15
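/*
 * For example, an rte_flow rule created with attr->priority == 0 is
 * installed as kernel TC priority 1, so the maximum accepted
 * attr->priority is 15 (kernel priority 16).
 */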
603
604 #define MLX5_TCF_FATE_ACTIONS \
605 (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \
606 MLX5_FLOW_ACTION_JUMP)
607
608 #define MLX5_TCF_VLAN_ACTIONS \
609 (MLX5_FLOW_ACTION_OF_POP_VLAN | MLX5_FLOW_ACTION_OF_PUSH_VLAN | \
610 MLX5_FLOW_ACTION_OF_SET_VLAN_VID | MLX5_FLOW_ACTION_OF_SET_VLAN_PCP)
611
612 #define MLX5_TCF_VXLAN_ACTIONS \
613 (MLX5_FLOW_ACTION_VXLAN_ENCAP | MLX5_FLOW_ACTION_VXLAN_DECAP)
614
615 #define MLX5_TCF_PEDIT_ACTIONS \
616 (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST | \
617 MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST | \
618 MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST | \
619 MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL | \
620 MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)
621
622 #define MLX5_TCF_CONFIG_ACTIONS \
623 (MLX5_FLOW_ACTION_PORT_ID | MLX5_FLOW_ACTION_JUMP | \
624 MLX5_FLOW_ACTION_OF_PUSH_VLAN | MLX5_FLOW_ACTION_OF_SET_VLAN_VID | \
625 MLX5_FLOW_ACTION_OF_SET_VLAN_PCP | \
626 (MLX5_TCF_PEDIT_ACTIONS & ~MLX5_FLOW_ACTION_DEC_TTL))
627
628 #define MAX_PEDIT_KEYS 128
629 #define SZ_PEDIT_KEY_VAL 4
630
631 #define NUM_OF_PEDIT_KEYS(sz) \
632 (((sz) / SZ_PEDIT_KEY_VAL) + (((sz) % SZ_PEDIT_KEY_VAL) ? 1 : 0))
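/*
 * Pedit keys rewrite 4-byte units, so for example
 * NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN) = (6 / 4) + 1 = 2 keys and
 * NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN) = 16 / 4 = 4 keys.
 */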
633
634 struct pedit_key_ex {
635 enum pedit_header_type htype;
636 enum pedit_cmd cmd;
637 };
638
639 struct pedit_parser {
640 struct tc_pedit_sel sel;
641 struct tc_pedit_key keys[MAX_PEDIT_KEYS];
642 struct pedit_key_ex keys_ex[MAX_PEDIT_KEYS];
643 };
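/*
 * Note: the flow_tcf_pedit_key_set_*() helpers below fill keys[] and
 * keys_ex[] and advance sel.nkeys; the selector together with its keys
 * is then emitted as a single TCA_PEDIT_PARMS_EX attribute by
 * flow_tcf_create_pedit_mnl_msg().
 */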
644
645 /**
646 * Create space for using the implicitly created TC flow counter.
647 *
651 * @return
652 * A pointer to the counter data structure, NULL otherwise and
653 * rte_errno is set.
654 */
655 static struct mlx5_flow_counter *
656 flow_tcf_counter_new(void)
657 {
658 struct mlx5_flow_counter *cnt;
659
660 /*
661 * E-switch counters cannot be shared and their IDs are unknown,
662 * so all are currently returned with ID 0. In the future it may
663 * be better to switch to unique numbers.
664 */
665 struct mlx5_flow_counter tmpl = {
666 .ref_cnt = 1,
667 };
668 cnt = rte_calloc(__func__, 1, sizeof(*cnt), 0);
669 if (!cnt) {
670 rte_errno = ENOMEM;
671 return NULL;
672 }
673 *cnt = tmpl;
674 /* Implicit counter, do not add to list. */
675 return cnt;
676 }
677
678 /**
679 * Set pedit key of MAC address
680 *
681 * @param[in] actions
682 * pointer to action specification
683 * @param[in,out] p_parser
684 * pointer to pedit_parser
685 */
686 static void
687 flow_tcf_pedit_key_set_mac(const struct rte_flow_action *actions,
688 struct pedit_parser *p_parser)
689 {
690 int idx = p_parser->sel.nkeys;
691 uint32_t off = actions->type == RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ?
692 offsetof(struct ether_hdr, s_addr) :
693 offsetof(struct ether_hdr, d_addr);
694 const struct rte_flow_action_set_mac *conf =
695 (const struct rte_flow_action_set_mac *)actions->conf;
696
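/*
 * A MAC address (6 bytes) does not fit into one 4-byte pedit key,
 * so two keys are emitted: the first fully rewrites bytes 0-3
 * (a mask with no bits set keeps nothing of the original word), the
 * second rewrites the remaining 2 bytes, with its mask marking the
 * half of the 32-bit word to be preserved from the packet.
 */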
697 p_parser->keys[idx].off = off;
698 p_parser->keys[idx].mask = ~UINT32_MAX;
699 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
700 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
701 memcpy(&p_parser->keys[idx].val,
702 conf->mac_addr, SZ_PEDIT_KEY_VAL);
703 idx++;
704 p_parser->keys[idx].off = off + SZ_PEDIT_KEY_VAL;
705 p_parser->keys[idx].mask = 0xFFFF0000;
706 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_ETH;
707 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
708 memcpy(&p_parser->keys[idx].val,
709 conf->mac_addr + SZ_PEDIT_KEY_VAL,
710 ETHER_ADDR_LEN - SZ_PEDIT_KEY_VAL);
711 p_parser->sel.nkeys = (++idx);
712 }
713
714 /**
715 * Set pedit key of decrease/set ttl
716 *
717 * @param[in] actions
718 * pointer to action specification
719 * @param[in,out] p_parser
720 * pointer to pedit_parser
721 * @param[in] item_flags
722 * flags of all items present
723 */
724 static void
725 flow_tcf_pedit_key_set_dec_ttl(const struct rte_flow_action *actions,
726 struct pedit_parser *p_parser,
727 uint64_t item_flags)
728 {
729 int idx = p_parser->sel.nkeys;
730
731 p_parser->keys[idx].mask = 0xFFFFFF00;
732 if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4) {
733 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
734 p_parser->keys[idx].off =
735 offsetof(struct ipv4_hdr, time_to_live);
736 }
737 if (item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6) {
738 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
739 p_parser->keys[idx].off =
740 offsetof(struct ipv6_hdr, hop_limits);
741 }
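/*
 * Note: TTL decrement is expressed as a pedit ADD of 0xFF, which is
 * arithmetically a decrement by one modulo 256 of the single
 * TTL/hop-limit byte; the 0xFFFFFF00 mask above confines the rewrite
 * to that byte within the 32-bit word.
 */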
742 if (actions->type == RTE_FLOW_ACTION_TYPE_DEC_TTL) {
743 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_ADD;
744 p_parser->keys[idx].val = 0x000000FF;
745 } else {
746 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
747 p_parser->keys[idx].val =
748 (__u32)((const struct rte_flow_action_set_ttl *)
749 actions->conf)->ttl_value;
750 }
751 p_parser->sel.nkeys = (++idx);
752 }
753
754 /**
755 * Set pedit key of transport (TCP/UDP) port value
756 *
757 * @param[in] actions
758 * pointer to action specification
759 * @param[in,out] p_parser
760 * pointer to pedit_parser
761 * @param[in] item_flags
762 * flags of all items present
763 */
764 static void
765 flow_tcf_pedit_key_set_tp_port(const struct rte_flow_action *actions,
766 struct pedit_parser *p_parser,
767 uint64_t item_flags)
768 {
769 int idx = p_parser->sel.nkeys;
770
771 if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP)
772 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP;
773 if (item_flags & MLX5_FLOW_LAYER_OUTER_L4_TCP)
774 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_TCP;
775 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
776 /* offset of src/dst port is the same for TCP and UDP */
777 p_parser->keys[idx].off =
778 actions->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC ?
779 offsetof(struct tcp_hdr, src_port) :
780 offsetof(struct tcp_hdr, dst_port);
781 p_parser->keys[idx].mask = 0xFFFF0000;
782 p_parser->keys[idx].val =
783 (__u32)((const struct rte_flow_action_set_tp *)
784 actions->conf)->port;
785 p_parser->sel.nkeys = (++idx);
786 }
787
788 /**
789 * Set pedit key of ipv6 address
790 *
791 * @param[in] actions
792 * pointer to action specification
793 * @param[in,out] p_parser
794 * pointer to pedit_parser
795 */
796 static void
797 flow_tcf_pedit_key_set_ipv6_addr(const struct rte_flow_action *actions,
798 struct pedit_parser *p_parser)
799 {
800 int idx = p_parser->sel.nkeys;
801 int keys = NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
802 int off_base =
803 actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ?
804 offsetof(struct ipv6_hdr, src_addr) :
805 offsetof(struct ipv6_hdr, dst_addr);
806 const struct rte_flow_action_set_ipv6 *conf =
807 (const struct rte_flow_action_set_ipv6 *)actions->conf;
808
809 for (int i = 0; i < keys; i++, idx++) {
810 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP6;
811 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
812 p_parser->keys[idx].off = off_base + i * SZ_PEDIT_KEY_VAL;
813 p_parser->keys[idx].mask = ~UINT32_MAX;
814 memcpy(&p_parser->keys[idx].val,
815 conf->ipv6_addr + i * SZ_PEDIT_KEY_VAL,
816 SZ_PEDIT_KEY_VAL);
817 }
818 p_parser->sel.nkeys += keys;
819 }
820
821 /**
822 * Set pedit key of ipv4 address
823 *
824 * @param[in] actions
825 * pointer to action specification
826 * @param[in,out] p_parser
827 * pointer to pedit_parser
828 */
829 static void
830 flow_tcf_pedit_key_set_ipv4_addr(const struct rte_flow_action *actions,
831 struct pedit_parser *p_parser)
832 {
833 int idx = p_parser->sel.nkeys;
834
835 p_parser->keys_ex[idx].htype = TCA_PEDIT_KEY_EX_HDR_TYPE_IP4;
836 p_parser->keys_ex[idx].cmd = TCA_PEDIT_KEY_EX_CMD_SET;
837 p_parser->keys[idx].off =
838 actions->type == RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ?
839 offsetof(struct ipv4_hdr, src_addr) :
840 offsetof(struct ipv4_hdr, dst_addr);
841 p_parser->keys[idx].mask = ~UINT32_MAX;
842 p_parser->keys[idx].val =
843 ((const struct rte_flow_action_set_ipv4 *)
844 actions->conf)->ipv4_addr;
845 p_parser->sel.nkeys = (++idx);
846 }
847
848 /**
849 * Create the pedit's netlink attribute in a netlink message
850 * using a pre-allocated message buffer.
851 *
852 * @param[in,out] nl
853 * pointer to pre-allocated netlink message buffer
854 * @param[in,out] actions
855 * pointer to pointer of actions specification
858 * @param[in] item_flags
859 * flags of all items present
860 */
861 static void
862 flow_tcf_create_pedit_mnl_msg(struct nlmsghdr *nl,
863 const struct rte_flow_action **actions,
864 uint64_t item_flags)
865 {
866 struct pedit_parser p_parser;
867 struct nlattr *na_act_options;
868 struct nlattr *na_pedit_keys;
869
870 memset(&p_parser, 0, sizeof(p_parser));
871 mnl_attr_put_strz(nl, TCA_ACT_KIND, "pedit");
872 na_act_options = mnl_attr_nest_start(nl, TCA_ACT_OPTIONS);
873 /* all modify header actions should be in one tc-pedit action */
874 for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
875 switch ((*actions)->type) {
876 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
877 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
878 flow_tcf_pedit_key_set_ipv4_addr(*actions, &p_parser);
879 break;
880 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
881 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
882 flow_tcf_pedit_key_set_ipv6_addr(*actions, &p_parser);
883 break;
884 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
885 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
886 flow_tcf_pedit_key_set_tp_port(*actions,
887 &p_parser, item_flags);
888 break;
889 case RTE_FLOW_ACTION_TYPE_SET_TTL:
890 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
891 flow_tcf_pedit_key_set_dec_ttl(*actions,
892 &p_parser, item_flags);
893 break;
894 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
895 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
896 flow_tcf_pedit_key_set_mac(*actions, &p_parser);
897 break;
898 default:
899 goto pedit_mnl_msg_done;
900 }
901 }
902 pedit_mnl_msg_done:
903 p_parser.sel.action = TC_ACT_PIPE;
904 mnl_attr_put(nl, TCA_PEDIT_PARMS_EX,
905 sizeof(p_parser.sel) +
906 p_parser.sel.nkeys * sizeof(struct tc_pedit_key),
907 &p_parser);
908 na_pedit_keys =
909 mnl_attr_nest_start(nl, TCA_PEDIT_KEYS_EX | NLA_F_NESTED);
910 for (int i = 0; i < p_parser.sel.nkeys; i++) {
911 struct nlattr *na_pedit_key =
912 mnl_attr_nest_start(nl,
913 TCA_PEDIT_KEY_EX | NLA_F_NESTED);
914 mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_HTYPE,
915 p_parser.keys_ex[i].htype);
916 mnl_attr_put_u16(nl, TCA_PEDIT_KEY_EX_CMD,
917 p_parser.keys_ex[i].cmd);
918 mnl_attr_nest_end(nl, na_pedit_key);
919 }
920 mnl_attr_nest_end(nl, na_pedit_keys);
921 mnl_attr_nest_end(nl, na_act_options);
922 (*actions)--;
923 }
924
925 /**
926 * Calculate the max memory size of one TC-pedit action.
927 * One TC-pedit action can contain a set of keys, each defining
928 * a rewrite element (rte_flow action).
929 *
930 * @param[in,out] actions
931 * actions specification.
932 * @param[in,out] action_flags
933 * actions flags
936 * @return
937 * Max memory size of one TC-pedit action
938 */
939 static int
940 flow_tcf_get_pedit_actions_size(const struct rte_flow_action **actions,
941 uint64_t *action_flags)
942 {
943 int pedit_size = 0;
944 int keys = 0;
945 uint64_t flags = 0;
946
947 pedit_size += SZ_NLATTR_NEST + /* na_act_index. */
948 SZ_NLATTR_STRZ_OF("pedit") +
949 SZ_NLATTR_NEST; /* TCA_ACT_OPTIONS. */
950 for (; (*actions)->type != RTE_FLOW_ACTION_TYPE_END; (*actions)++) {
951 switch ((*actions)->type) {
952 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
953 keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
954 flags |= MLX5_FLOW_ACTION_SET_IPV4_SRC;
955 break;
956 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
957 keys += NUM_OF_PEDIT_KEYS(IPV4_ADDR_LEN);
958 flags |= MLX5_FLOW_ACTION_SET_IPV4_DST;
959 break;
960 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
961 keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
962 flags |= MLX5_FLOW_ACTION_SET_IPV6_SRC;
963 break;
964 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
965 keys += NUM_OF_PEDIT_KEYS(IPV6_ADDR_LEN);
966 flags |= MLX5_FLOW_ACTION_SET_IPV6_DST;
967 break;
968 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
969 /* TCP is the same as UDP */
970 keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
971 flags |= MLX5_FLOW_ACTION_SET_TP_SRC;
972 break;
973 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
974 /* TCP is the same as UDP */
975 keys += NUM_OF_PEDIT_KEYS(TP_PORT_LEN);
976 flags |= MLX5_FLOW_ACTION_SET_TP_DST;
977 break;
978 case RTE_FLOW_ACTION_TYPE_SET_TTL:
979 keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
980 flags |= MLX5_FLOW_ACTION_SET_TTL;
981 break;
982 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
983 keys += NUM_OF_PEDIT_KEYS(TTL_LEN);
984 flags |= MLX5_FLOW_ACTION_DEC_TTL;
985 break;
986 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
987 keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
988 flags |= MLX5_FLOW_ACTION_SET_MAC_SRC;
989 break;
990 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
991 keys += NUM_OF_PEDIT_KEYS(ETHER_ADDR_LEN);
992 flags |= MLX5_FLOW_ACTION_SET_MAC_DST;
993 break;
994 default:
995 goto get_pedit_action_size_done;
996 }
997 }
998 get_pedit_action_size_done:
999 /* TCA_PEDIT_PARMS_EX */
1000 pedit_size +=
1001 SZ_NLATTR_DATA_OF(sizeof(struct tc_pedit_sel) +
1002 keys * sizeof(struct tc_pedit_key));
1003 pedit_size += SZ_NLATTR_NEST; /* TCA_PEDIT_KEYS */
1004 pedit_size += keys *
1005 /* TCA_PEDIT_KEY_EX + HTYPE + CMD */
1006 (SZ_NLATTR_NEST + SZ_NLATTR_DATA_OF(2) +
1007 SZ_NLATTR_DATA_OF(2));
1008 (*action_flags) |= flags;
1009 (*actions)--;
1010 return pedit_size;
1011 }
1012
1013 /**
1014 * Retrieve mask for pattern item.
1015 *
1016 * This function does basic sanity checks on a pattern item in order to
1017 * return the most appropriate mask for it.
1018 *
1019 * @param[in] item
1020 * Item specification.
1021 * @param[in] mask_default
1022 * Default mask for pattern item as specified by the flow API.
1023 * @param[in] mask_supported
1024 * Mask fields supported by the implementation.
1025 * @param[in] mask_empty
1026 * Empty mask to return when there is no specification.
1027 * @param[out] error
1028 * Perform verbose error reporting if not NULL.
1029 *
1030 * @return
1031 * Either @p item->mask or one of the mask parameters on success, NULL
1032 * otherwise and rte_errno is set.
1033 */
1034 static const void *
1035 flow_tcf_item_mask(const struct rte_flow_item *item, const void *mask_default,
1036 const void *mask_supported, const void *mask_empty,
1037 size_t mask_size, struct rte_flow_error *error)
1038 {
1039 const uint8_t *mask;
1040 size_t i;
1041
1042 /* item->last and item->mask cannot exist without item->spec. */
1043 if (!item->spec && (item->mask || item->last)) {
1044 rte_flow_error_set(error, EINVAL,
1045 RTE_FLOW_ERROR_TYPE_ITEM, item,
1046 "\"mask\" or \"last\" field provided without"
1047 " a corresponding \"spec\"");
1048 return NULL;
1049 }
1050 /* No spec, no mask, no problem. */
1051 if (!item->spec)
1052 return mask_empty;
1053 mask = item->mask ? item->mask : mask_default;
1054 assert(mask);
1055 /*
1056 * Single-pass check to make sure that:
1057 * - Mask is supported, no bits are set outside mask_supported.
1058 * - Both item->spec and item->last are included in mask.
1059 */
1060 for (i = 0; i != mask_size; ++i) {
1061 if (!mask[i])
1062 continue;
1063 if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
1064 ((const uint8_t *)mask_supported)[i]) {
1065 rte_flow_error_set(error, ENOTSUP,
1066 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1067 "unsupported field found"
1068 " in \"mask\"");
1069 return NULL;
1070 }
1071 if (item->last &&
1072 (((const uint8_t *)item->spec)[i] & mask[i]) !=
1073 (((const uint8_t *)item->last)[i] & mask[i])) {
1074 rte_flow_error_set(error, EINVAL,
1075 RTE_FLOW_ERROR_TYPE_ITEM_LAST,
1076 item->last,
1077 "range between \"spec\" and \"last\""
1078 " not comprised in \"mask\"");
1079 return NULL;
1080 }
1081 }
1082 return mask;
1083 }
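/*
 * A minimal usage sketch (hypothetical local variable names): for an
 * RTE_FLOW_ITEM_TYPE_ETH item the caller typically selects the mask as
 *
 *   mask.eth = flow_tcf_item_mask(items, &rte_flow_item_eth_mask,
 *                                 &flow_tcf_mask_supported.eth,
 *                                 &flow_tcf_mask_empty.eth,
 *                                 sizeof(flow_tcf_mask_empty.eth), error);
 *   if (!mask.eth)
 *           return -rte_errno;
 *
 * i.e. item->mask when given, the default mask otherwise, and the empty
 * mask when the item carries no spec at all.
 */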
1084
1085 /**
1086 * Build a conversion table between port ID and ifindex.
1087 *
1088 * @param[in] dev
1089 * Pointer to Ethernet device.
1090 * @param[out] ptoi
1091 * Pointer to ptoi table.
1092 * @param[in] len
1093 * Size of ptoi table provided.
1094 *
1095 * @return
1096 * Size of ptoi table filled.
1097 */
1098 static unsigned int
1099 flow_tcf_build_ptoi_table(struct rte_eth_dev *dev, struct flow_tcf_ptoi *ptoi,
1100 unsigned int len)
1101 {
1102 unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
1103 uint16_t port_id[n + 1];
1104 unsigned int i;
1105 unsigned int own = 0;
1106
1107 /* At least one port is needed when no switch domain is present. */
1108 if (!n) {
1109 n = 1;
1110 port_id[0] = dev->data->port_id;
1111 } else {
1112 n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
1113 }
1114 if (n > len)
1115 return 0;
1116 for (i = 0; i != n; ++i) {
1117 struct rte_eth_dev_info dev_info;
1118
1119 rte_eth_dev_info_get(port_id[i], &dev_info);
1120 if (port_id[i] == dev->data->port_id)
1121 own = i;
1122 ptoi[i].port_id = port_id[i];
1123 ptoi[i].ifindex = dev_info.if_index;
1124 }
1125 /* Ensure first entry of ptoi[] is the current device. */
1126 if (own) {
1127 ptoi[n] = ptoi[0];
1128 ptoi[0] = ptoi[own];
1129 ptoi[own] = ptoi[n];
1130 }
1131 /* An entry with zero ifindex terminates ptoi[]. */
1132 ptoi[n].port_id = 0;
1133 ptoi[n].ifindex = 0;
1134 return n;
1135 }
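/*
 * Note: the resulting table always has the caller's own port as entry 0
 * and is terminated by an entry with ifindex == 0, so lookups below can
 * simply iterate "for (i = 0; ptoi[i].ifindex; ++i)".
 */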
1136
1137 /**
1138 * Verify the @p attr will be correctly understood by the E-switch.
1139 *
1140 * @param[in] attr
1141 * Pointer to flow attributes
1142 * @param[out] error
1143 * Pointer to error structure.
1144 *
1145 * @return
1146 * 0 on success, a negative errno value otherwise and rte_errno is set.
1147 */
1148 static int
1149 flow_tcf_validate_attributes(const struct rte_flow_attr *attr,
1150 struct rte_flow_error *error)
1151 {
1152 /*
1153 * Supported attributes: groups, some priorities and ingress only.
1154 * group is supported only if kernel supports chain. Don't care about
1155 * transfer as it is the caller's problem.
1156 */
1157 if (attr->group > MLX5_TCF_GROUP_ID_MAX)
1158 return rte_flow_error_set(error, ENOTSUP,
1159 RTE_FLOW_ERROR_TYPE_ATTR_GROUP, attr,
1160 "group ID larger than "
1161 RTE_STR(MLX5_TCF_GROUP_ID_MAX)
1162 " isn't supported");
1163 else if (attr->priority > MLX5_TCF_GROUP_PRIORITY_MAX)
1164 return rte_flow_error_set(error, ENOTSUP,
1165 RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
1166 attr,
1167 "priority more than "
1168 RTE_STR(MLX5_TCF_GROUP_PRIORITY_MAX)
1169 " is not supported");
1170 if (!attr->ingress)
1171 return rte_flow_error_set(error, EINVAL,
1172 RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
1173 attr, "only ingress is supported");
1174 if (attr->egress)
1175 return rte_flow_error_set(error, ENOTSUP,
1176 RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
1177 attr, "egress is not supported");
1178 return 0;
1179 }
1180
1181 /**
1182 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_ETH item for E-Switch.
1183 * The routine checks the L2 fields to be used in encapsulation header.
1184 *
1185 * @param[in] item
1186 * Pointer to the item structure.
1187 * @param[out] error
1188 * Pointer to the error structure.
1189 *
1190 * @return
1191 * 0 on success, a negative errno value otherwise and rte_errno is set.
1192 **/
1193 static int
1194 flow_tcf_validate_vxlan_encap_eth(const struct rte_flow_item *item,
1195 struct rte_flow_error *error)
1196 {
1197 const struct rte_flow_item_eth *spec = item->spec;
1198 const struct rte_flow_item_eth *mask = item->mask;
1199
1200 if (!spec) {
1201 /*
1202 * Specification for L2 addresses can be empty
1203 * because they are optional and not required
1204 * directly by the tc rule. The kernel tries to
1205 * resolve them on its own.
1206 */
1207 return 0;
1208 }
1209 if (!mask) {
1210 /* If mask is not specified use the default one. */
1211 mask = &rte_flow_item_eth_mask;
1212 }
1213 if (memcmp(&mask->dst,
1214 &flow_tcf_mask_empty.eth.dst,
1215 sizeof(flow_tcf_mask_empty.eth.dst))) {
1216 if (memcmp(&mask->dst,
1217 &rte_flow_item_eth_mask.dst,
1218 sizeof(rte_flow_item_eth_mask.dst)))
1219 return rte_flow_error_set
1220 (error, ENOTSUP,
1221 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1222 "no support for partial mask on"
1223 " \"eth.dst\" field");
1224 }
1225 if (memcmp(&mask->src,
1226 &flow_tcf_mask_empty.eth.src,
1227 sizeof(flow_tcf_mask_empty.eth.src))) {
1228 if (memcmp(&mask->src,
1229 &rte_flow_item_eth_mask.src,
1230 sizeof(rte_flow_item_eth_mask.src)))
1231 return rte_flow_error_set
1232 (error, ENOTSUP,
1233 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1234 "no support for partial mask on"
1235 " \"eth.src\" field");
1236 }
1237 if (mask->type != RTE_BE16(0x0000)) {
1238 if (mask->type != RTE_BE16(0xffff))
1239 return rte_flow_error_set
1240 (error, ENOTSUP,
1241 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1242 "no support for partial mask on"
1243 " \"eth.type\" field");
1244 DRV_LOG(WARNING,
1245 "outer ethernet type field"
1246 " cannot be forced for vxlan"
1247 " encapsulation, parameter ignored");
1248 }
1249 return 0;
1250 }
1251
1252 /**
1253 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV4 item for E-Switch.
1254 * The routine checks the IPv4 fields to be used in encapsulation header.
1255 *
1256 * @param[in] item
1257 * Pointer to the item structure.
1258 * @param[out] error
1259 * Pointer to the error structure.
1260 *
1261 * @return
1262 * 0 on success, a negative errno value otherwise and rte_errno is set.
1263 **/
1264 static int
1265 flow_tcf_validate_vxlan_encap_ipv4(const struct rte_flow_item *item,
1266 struct rte_flow_error *error)
1267 {
1268 const struct rte_flow_item_ipv4 *spec = item->spec;
1269 const struct rte_flow_item_ipv4 *mask = item->mask;
1270
1271 if (!spec) {
1272 /*
1273 * Specification for IP addresses cannot be empty
1274 * because it is required by tunnel_key parameter.
1275 */
1276 return rte_flow_error_set(error, EINVAL,
1277 RTE_FLOW_ERROR_TYPE_ITEM, item,
1278 "NULL outer ipv4 address"
1279 " specification for vxlan"
1280 " encapsulation");
1281 }
1282 if (!mask)
1283 mask = &rte_flow_item_ipv4_mask;
1284 if (mask->hdr.dst_addr != RTE_BE32(0x00000000)) {
1285 if (mask->hdr.dst_addr != RTE_BE32(0xffffffff))
1286 return rte_flow_error_set
1287 (error, ENOTSUP,
1288 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1289 "no support for partial mask on"
1290 " \"ipv4.hdr.dst_addr\" field"
1291 " for vxlan encapsulation");
1292 /* More IPv4 address validations can be put here. */
1293 } else {
1294 /*
1295 * Kernel uses the destination IP address to determine
1296 * the routing path and obtain the MAC destination
1297 * address, so IP destination address must be
1298 * specified in the tc rule.
1299 */
1300 return rte_flow_error_set(error, EINVAL,
1301 RTE_FLOW_ERROR_TYPE_ITEM, item,
1302 "outer ipv4 destination address"
1303 " must be specified for"
1304 " vxlan encapsulation");
1305 }
1306 if (mask->hdr.src_addr != RTE_BE32(0x00000000)) {
1307 if (mask->hdr.src_addr != RTE_BE32(0xffffffff))
1308 return rte_flow_error_set
1309 (error, ENOTSUP,
1310 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1311 "no support for partial mask on"
1312 " \"ipv4.hdr.src_addr\" field"
1313 " for vxlan encapsulation");
1314 /* More IPv4 address validations can be put here. */
1315 } else {
1316 /*
1317 * Kernel uses the source IP address to select the
1318 * interface for egress encapsulated traffic, so
1319 * it must be specified in the tc rule.
1320 */
1321 return rte_flow_error_set(error, EINVAL,
1322 RTE_FLOW_ERROR_TYPE_ITEM, item,
1323 "outer ipv4 source address"
1324 " must be specified for"
1325 " vxlan encapsulation");
1326 }
1327 if (mask->hdr.type_of_service &&
1328 mask->hdr.type_of_service != 0xff)
1329 return rte_flow_error_set(error, ENOTSUP,
1330 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1331 "no support for partial mask on"
1332 " \"ipv4.hdr.type_of_service\" field"
1333 " for vxlan encapsulation");
1334 if (mask->hdr.time_to_live &&
1335 mask->hdr.time_to_live != 0xff)
1336 return rte_flow_error_set(error, ENOTSUP,
1337 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1338 "no support for partial mask on"
1339 " \"ipv4.hdr.time_to_live\" field"
1340 " for vxlan encapsulation");
1341 return 0;
1342 }
1343
1344 /**
1345 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_IPV6 item for E-Switch.
1346 * The routine checks the IPv6 fields to be used in encapsulation header.
1347 *
1348 * @param[in] item
1349 * Pointer to the item structure.
1350 * @param[out] error
1351 * Pointer to the error structure.
1352 *
1353 * @return
1354 * 0 on success, a negative errno value otherwise and rte_errno is set.
1355 **/
1356 static int
1357 flow_tcf_validate_vxlan_encap_ipv6(const struct rte_flow_item *item,
1358 struct rte_flow_error *error)
1359 {
1360 const struct rte_flow_item_ipv6 *spec = item->spec;
1361 const struct rte_flow_item_ipv6 *mask = item->mask;
1362 uint8_t msk6;
1363
1364 if (!spec) {
1365 /*
1366 * Specification for IP addresses cannot be empty
1367 * because it is required by tunnel_key parameter.
1368 */
1369 return rte_flow_error_set(error, EINVAL,
1370 RTE_FLOW_ERROR_TYPE_ITEM, item,
1371 "NULL outer ipv6 address"
1372 " specification for"
1373 " vxlan encapsulation");
1374 }
1375 if (!mask)
1376 mask = &rte_flow_item_ipv6_mask;
1377 if (memcmp(&mask->hdr.dst_addr,
1378 &flow_tcf_mask_empty.ipv6.hdr.dst_addr,
1379 IPV6_ADDR_LEN)) {
1380 if (memcmp(&mask->hdr.dst_addr,
1381 &rte_flow_item_ipv6_mask.hdr.dst_addr,
1382 IPV6_ADDR_LEN))
1383 return rte_flow_error_set
1384 (error, ENOTSUP,
1385 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1386 "no support for partial mask on"
1387 " \"ipv6.hdr.dst_addr\" field"
1388 " for vxlan encapsulation");
1389 /* More IPv6 address validations can be put here. */
1390 } else {
1391 /*
1392 * Kernel uses the destination IP address to determine
1393 * the routing path and obtain the MAC destination
1394 * address (neighbor or gateway), so IP destination address
1395 * must be specified within the tc rule.
1396 */
1397 return rte_flow_error_set(error, EINVAL,
1398 RTE_FLOW_ERROR_TYPE_ITEM, item,
1399 "outer ipv6 destination address"
1400 " must be specified for"
1401 " vxlan encapsulation");
1402 }
1403 if (memcmp(&mask->hdr.src_addr,
1404 &flow_tcf_mask_empty.ipv6.hdr.src_addr,
1405 IPV6_ADDR_LEN)) {
1406 if (memcmp(&mask->hdr.src_addr,
1407 &rte_flow_item_ipv6_mask.hdr.src_addr,
1408 IPV6_ADDR_LEN))
1409 return rte_flow_error_set
1410 (error, ENOTSUP,
1411 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1412 "no support for partial mask on"
1413 " \"ipv6.hdr.src_addr\" field"
1414 " for vxlan encapsulation");
1415 /* More L3 address validation can be put here. */
1416 } else {
1417 /*
1418 * Kernel uses the source IP address to select the
1419 * interface for egress encapsulated traffic, so
1420 * it must be specified in the tc rule.
1421 */
1422 return rte_flow_error_set(error, EINVAL,
1423 RTE_FLOW_ERROR_TYPE_ITEM, item,
1424 "outer L3 source address"
1425 " must be specified for"
1426 " vxlan encapsulation");
1427 }
1428 msk6 = (rte_be_to_cpu_32(mask->hdr.vtc_flow) >>
1429 IPV6_HDR_TC_SHIFT) & 0xff;
1430 if (msk6 && msk6 != 0xff)
1431 return rte_flow_error_set(error, ENOTSUP,
1432 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1433 "no support for partial mask on"
1434 " \"ipv6.hdr.vtc_flow.tos\" field"
1435 " for vxlan encapsulation");
1436 if (mask->hdr.hop_limits && mask->hdr.hop_limits != 0xff)
1437 return rte_flow_error_set(error, ENOTSUP,
1438 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1439 "no support for partial mask on"
1440 " \"ipv6.hdr.hop_limits\" field"
1441 " for vxlan encapsulation");
1442 return 0;
1443 }
1444
1445 /**
1446 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_UDP item for E-Switch.
1447 * The routine checks the UDP fields to be used in encapsulation header.
1448 *
1449 * @param[in] item
1450 * Pointer to the item structure.
1451 * @param[out] error
1452 * Pointer to the error structure.
1453 *
1454 * @return
1455 * 0 on success, a negative errno value otherwise and rte_errno is set.
1456 **/
1457 static int
1458 flow_tcf_validate_vxlan_encap_udp(const struct rte_flow_item *item,
1459 struct rte_flow_error *error)
1460 {
1461 const struct rte_flow_item_udp *spec = item->spec;
1462 const struct rte_flow_item_udp *mask = item->mask;
1463
1464 if (!spec) {
1465 /*
1466 * Specification for UDP ports cannot be empty
1467 * because it is required by tunnel_key parameter.
1468 */
1469 return rte_flow_error_set(error, EINVAL,
1470 RTE_FLOW_ERROR_TYPE_ITEM, item,
1471 "NULL UDP port specification "
1472 " for vxlan encapsulation");
1473 }
1474 if (!mask)
1475 mask = &rte_flow_item_udp_mask;
1476 if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1477 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1478 return rte_flow_error_set
1479 (error, ENOTSUP,
1480 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1481 "no support for partial mask on"
1482 " \"udp.hdr.dst_port\" field"
1483 " for vxlan encapsulation");
1484 if (!spec->hdr.dst_port)
1485 return rte_flow_error_set
1486 (error, EINVAL,
1487 RTE_FLOW_ERROR_TYPE_ITEM, item,
1488 "outer UDP remote port cannot be"
1489 " 0 for vxlan encapsulation");
1490 } else {
1491 return rte_flow_error_set(error, EINVAL,
1492 RTE_FLOW_ERROR_TYPE_ITEM, item,
1493 "outer UDP remote port"
1494 " must be specified for"
1495 " vxlan encapsulation");
1496 }
1497 if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1498 if (mask->hdr.src_port != RTE_BE16(0xffff))
1499 return rte_flow_error_set
1500 (error, ENOTSUP,
1501 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1502 "no support for partial mask on"
1503 " \"udp.hdr.src_port\" field"
1504 " for vxlan encapsulation");
1505 DRV_LOG(WARNING,
1506 "outer UDP source port cannot be"
1507 " forced for vxlan encapsulation,"
1508 " parameter ignored");
1509 }
1510 return 0;
1511 }
1512
1513 /**
1514 * Validate VXLAN_ENCAP action RTE_FLOW_ITEM_TYPE_VXLAN item for E-Switch.
1515 * The routine checks the VNI field to be used in the encapsulation header.
1516 *
1517 * @param[in] item
1518 * Pointer to the item structure.
1519 * @param[out] error
1520 * Pointer to the error structure.
1521 *
1522 * @return
1523 * 0 on success, a negative errno value otherwise and rte_errno is set.
1524 **/
1525 static int
1526 flow_tcf_validate_vxlan_encap_vni(const struct rte_flow_item *item,
1527 struct rte_flow_error *error)
1528 {
1529 const struct rte_flow_item_vxlan *spec = item->spec;
1530 const struct rte_flow_item_vxlan *mask = item->mask;
1531
1532 if (!spec) {
1533 /* Outer VNI is required by tunnel_key parameter. */
1534 return rte_flow_error_set(error, EINVAL,
1535 RTE_FLOW_ERROR_TYPE_ITEM, item,
1536 "NULL VNI specification"
1537 " for vxlan encapsulation");
1538 }
1539 if (!mask)
1540 mask = &rte_flow_item_vxlan_mask;
1541 if (!mask->vni[0] && !mask->vni[1] && !mask->vni[2])
1542 return rte_flow_error_set(error, EINVAL,
1543 RTE_FLOW_ERROR_TYPE_ITEM, item,
1544 "outer VNI must be specified "
1545 "for vxlan encapsulation");
1546 if (mask->vni[0] != 0xff ||
1547 mask->vni[1] != 0xff ||
1548 mask->vni[2] != 0xff)
1549 return rte_flow_error_set(error, ENOTSUP,
1550 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1551 "no support for partial mask on"
1552 " \"vxlan.vni\" field");
1553
1554 if (!spec->vni[0] && !spec->vni[1] && !spec->vni[2])
1555 return rte_flow_error_set(error, EINVAL,
1556 RTE_FLOW_ERROR_TYPE_ITEM, item,
1557 "vxlan vni cannot be 0");
1558 return 0;
1559 }
1560
1561 /**
1562 * Validate VXLAN_ENCAP action item list for E-Switch.
1563 * The routine checks items to be used in encapsulation header.
1564 *
1565 * @param[in] action
1566 * Pointer to the VXLAN_ENCAP action structure.
1567 * @param[out] error
1568 * Pointer to the error structure.
1569 *
1570 * @return
1571 * 0 on success, a negative errno value otherwise and rte_errno is set.
1572 **/
1573 static int
1574 flow_tcf_validate_vxlan_encap(const struct rte_flow_action *action,
1575 struct rte_flow_error *error)
1576 {
1577 const struct rte_flow_item *items;
1578 int ret;
1579 uint32_t item_flags = 0;
1580
1581 if (!action->conf)
1582 return rte_flow_error_set(error, EINVAL,
1583 RTE_FLOW_ERROR_TYPE_ACTION, action,
1584 "Missing vxlan tunnel"
1585 " action configuration");
1586 items = ((const struct rte_flow_action_vxlan_encap *)
1587 action->conf)->definition;
1588 if (!items)
1589 return rte_flow_error_set(error, EINVAL,
1590 RTE_FLOW_ERROR_TYPE_ACTION, action,
1591 "Missing vxlan tunnel"
1592 " encapsulation parameters");
1593 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1594 switch (items->type) {
1595 case RTE_FLOW_ITEM_TYPE_VOID:
1596 break;
1597 case RTE_FLOW_ITEM_TYPE_ETH:
1598 ret = mlx5_flow_validate_item_eth(items, item_flags,
1599 error);
1600 if (ret < 0)
1601 return ret;
1602 ret = flow_tcf_validate_vxlan_encap_eth(items, error);
1603 if (ret < 0)
1604 return ret;
1605 item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
1606 break;
1608 case RTE_FLOW_ITEM_TYPE_IPV4:
1609 ret = mlx5_flow_validate_item_ipv4
1610 (items, item_flags,
1611 &flow_tcf_mask_supported.ipv4, error);
1612 if (ret < 0)
1613 return ret;
1614 ret = flow_tcf_validate_vxlan_encap_ipv4(items, error);
1615 if (ret < 0)
1616 return ret;
1617 item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
1618 break;
1619 case RTE_FLOW_ITEM_TYPE_IPV6:
1620 ret = mlx5_flow_validate_item_ipv6
1621 (items, item_flags,
1622 &flow_tcf_mask_supported.ipv6, error);
1623 if (ret < 0)
1624 return ret;
1625 ret = flow_tcf_validate_vxlan_encap_ipv6(items, error);
1626 if (ret < 0)
1627 return ret;
1628 item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
1629 break;
1630 case RTE_FLOW_ITEM_TYPE_UDP:
1631 ret = mlx5_flow_validate_item_udp(items, item_flags,
1632 0xFF, error);
1633 if (ret < 0)
1634 return ret;
1635 ret = flow_tcf_validate_vxlan_encap_udp(items, error);
1636 if (ret < 0)
1637 return ret;
1638 item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
1639 break;
1640 case RTE_FLOW_ITEM_TYPE_VXLAN:
1641 ret = mlx5_flow_validate_item_vxlan(items,
1642 item_flags, error);
1643 if (ret < 0)
1644 return ret;
1645 ret = flow_tcf_validate_vxlan_encap_vni(items, error);
1646 if (ret < 0)
1647 return ret;
1648 item_flags |= MLX5_FLOW_LAYER_VXLAN;
1649 break;
1650 default:
1651 return rte_flow_error_set
1652 (error, ENOTSUP,
1653 RTE_FLOW_ERROR_TYPE_ITEM, items,
1654 "vxlan encap item not supported");
1655 }
1656 }
1657 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3))
1658 return rte_flow_error_set(error, EINVAL,
1659 RTE_FLOW_ERROR_TYPE_ACTION, action,
1660 "no outer IP layer found"
1661 " for vxlan encapsulation");
1662 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
1663 return rte_flow_error_set(error, EINVAL,
1664 RTE_FLOW_ERROR_TYPE_ACTION, action,
1665 "no outer UDP layer found"
1666 " for vxlan encapsulation");
1667 if (!(item_flags & MLX5_FLOW_LAYER_VXLAN))
1668 return rte_flow_error_set(error, EINVAL,
1669 RTE_FLOW_ERROR_TYPE_ACTION, action,
1670 "no VXLAN VNI found"
1671 " for vxlan encapsulation");
1672 return 0;
1673 }
1674
1675 /**
1676 * Validate outer RTE_FLOW_ITEM_TYPE_UDP item if tunnel item
1677 * RTE_FLOW_ITEM_TYPE_VXLAN is present in item list.
1678 *
1679 * @param[in] udp
1680 * Outer UDP layer item (if any, NULL otherwise).
1681 * @param[out] error
1682 * Pointer to the error structure.
1683 *
1684 * @return
1685 * 0 on success, a negative errno value otherwise and rte_errno is set.
1686 **/
1687 static int
1688 flow_tcf_validate_vxlan_decap_udp(const struct rte_flow_item *udp,
1689 struct rte_flow_error *error)
1690 {
1691 const struct rte_flow_item_udp *spec = udp->spec;
1692 const struct rte_flow_item_udp *mask = udp->mask;
1693
1694 if (!spec)
1695 /*
1696 * Specification for UDP ports cannot be empty
1697 * because it is required as decap parameter.
1698 */
1699 return rte_flow_error_set(error, EINVAL,
1700 RTE_FLOW_ERROR_TYPE_ITEM, udp,
1701 "NULL UDP port specification"
1702 " for VXLAN decapsulation");
1703 if (!mask)
1704 mask = &rte_flow_item_udp_mask;
1705 if (mask->hdr.dst_port != RTE_BE16(0x0000)) {
1706 if (mask->hdr.dst_port != RTE_BE16(0xffff))
1707 return rte_flow_error_set
1708 (error, ENOTSUP,
1709 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1710 "no support for partial mask on"
1711 " \"udp.hdr.dst_port\" field");
1712 if (!spec->hdr.dst_port)
1713 return rte_flow_error_set
1714 (error, EINVAL,
1715 RTE_FLOW_ERROR_TYPE_ITEM, udp,
1716 "zero decap local UDP port");
1717 } else {
1718 return rte_flow_error_set(error, EINVAL,
1719 RTE_FLOW_ERROR_TYPE_ITEM, udp,
1720 "outer UDP destination port must be "
1721 "specified for vxlan decapsulation");
1722 }
1723 if (mask->hdr.src_port != RTE_BE16(0x0000)) {
1724 if (mask->hdr.src_port != RTE_BE16(0xffff))
1725 return rte_flow_error_set
1726 (error, ENOTSUP,
1727 RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
1728 "no support for partial mask on"
1729 " \"udp.hdr.src_port\" field");
1730 DRV_LOG(WARNING,
1731 "outer UDP local port cannot be "
1732 "forced for VXLAN encapsulation, "
1733 "parameter ignored");
1734 }
1735 return 0;
1736 }
1737
1738 /**
1739 * Validate flow for E-Switch.
1740 *
1741 * @param[in] priv
1742 * Pointer to the priv structure.
1743 * @param[in] attr
1744 * Pointer to the flow attributes.
1745 * @param[in] items
1746 * Pointer to the list of items.
1747 * @param[in] actions
1748 * Pointer to the list of actions.
1749 * @param[out] error
1750 * Pointer to the error structure.
1751 *
1752 * @return
1753 * 0 on success, a negative errno value otherwise and rte_errno is set.
1754 */
1755 static int
1756 flow_tcf_validate(struct rte_eth_dev *dev,
1757 const struct rte_flow_attr *attr,
1758 const struct rte_flow_item items[],
1759 const struct rte_flow_action actions[],
1760 struct rte_flow_error *error)
1761 {
1762 union {
1763 const struct rte_flow_item_port_id *port_id;
1764 const struct rte_flow_item_eth *eth;
1765 const struct rte_flow_item_vlan *vlan;
1766 const struct rte_flow_item_ipv4 *ipv4;
1767 const struct rte_flow_item_ipv6 *ipv6;
1768 const struct rte_flow_item_tcp *tcp;
1769 const struct rte_flow_item_udp *udp;
1770 const struct rte_flow_item_vxlan *vxlan;
1771 } spec, mask;
1772 union {
1773 const struct rte_flow_action_port_id *port_id;
1774 const struct rte_flow_action_jump *jump;
1775 const struct rte_flow_action_of_push_vlan *of_push_vlan;
1776 const struct rte_flow_action_of_set_vlan_vid *
1777 of_set_vlan_vid;
1778 const struct rte_flow_action_of_set_vlan_pcp *
1779 of_set_vlan_pcp;
1780 const struct rte_flow_action_vxlan_encap *vxlan_encap;
1781 const struct rte_flow_action_set_ipv4 *set_ipv4;
1782 const struct rte_flow_action_set_ipv6 *set_ipv6;
1783 } conf;
1784 const struct rte_flow_item *outer_udp = NULL;
1785 rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
1786 rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
1787 rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
1788 uint64_t item_flags = 0;
1789 uint64_t action_flags = 0;
1790 uint8_t next_protocol = 0xff;
1791 unsigned int tcm_ifindex = 0;
1792 uint8_t pedit_validated = 0;
1793 struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
1794 struct rte_eth_dev *port_id_dev = NULL;
1795 bool in_port_id_set;
1796 int ret;
1797
1798 claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
1799 PTOI_TABLE_SZ_MAX(dev)));
1800 ret = flow_tcf_validate_attributes(attr, error);
1801 if (ret < 0)
1802 return ret;
1803 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
1804 unsigned int i;
1805 uint64_t current_action_flag = 0;
1806
1807 switch (actions->type) {
1808 case RTE_FLOW_ACTION_TYPE_VOID:
1809 break;
1810 case RTE_FLOW_ACTION_TYPE_PORT_ID:
1811 current_action_flag = MLX5_FLOW_ACTION_PORT_ID;
1812 if (!actions->conf)
1813 break;
1814 conf.port_id = actions->conf;
1815 if (conf.port_id->original)
1816 i = 0;
1817 else
1818 for (i = 0; ptoi[i].ifindex; ++i)
1819 if (ptoi[i].port_id == conf.port_id->id)
1820 break;
1821 if (!ptoi[i].ifindex)
1822 return rte_flow_error_set
1823 (error, ENODEV,
1824 RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1825 conf.port_id,
1826 "missing data to convert port ID to"
1827 " ifindex");
1828 port_id_dev = &rte_eth_devices[conf.port_id->id];
1829 break;
1830 case RTE_FLOW_ACTION_TYPE_JUMP:
1831 current_action_flag = MLX5_FLOW_ACTION_JUMP;
1832 if (!actions->conf)
1833 break;
1834 conf.jump = actions->conf;
1835 if (attr->group >= conf.jump->group)
1836 return rte_flow_error_set
1837 (error, ENOTSUP,
1838 RTE_FLOW_ERROR_TYPE_ACTION,
1839 actions,
1840 					 "can jump only to a group ahead");
1841 break;
1842 case RTE_FLOW_ACTION_TYPE_DROP:
1843 current_action_flag = MLX5_FLOW_ACTION_DROP;
1844 break;
1845 case RTE_FLOW_ACTION_TYPE_COUNT:
1846 break;
1847 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
1848 current_action_flag = MLX5_FLOW_ACTION_OF_POP_VLAN;
1849 break;
1850 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: {
1851 rte_be16_t ethertype;
1852
1853 current_action_flag = MLX5_FLOW_ACTION_OF_PUSH_VLAN;
1854 if (!actions->conf)
1855 break;
1856 conf.of_push_vlan = actions->conf;
1857 ethertype = conf.of_push_vlan->ethertype;
1858 if (ethertype != RTE_BE16(ETH_P_8021Q) &&
1859 ethertype != RTE_BE16(ETH_P_8021AD))
1860 return rte_flow_error_set
1861 (error, EINVAL,
1862 RTE_FLOW_ERROR_TYPE_ACTION, actions,
1863 "vlan push TPID must be "
1864 "802.1Q or 802.1AD");
1865 break;
1866 }
1867 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
1868 if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1869 return rte_flow_error_set
1870 (error, ENOTSUP,
1871 RTE_FLOW_ERROR_TYPE_ACTION, actions,
1872 "vlan modify is not supported,"
1873 " set action must follow push action");
1874 current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
1875 break;
1876 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
1877 if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN))
1878 return rte_flow_error_set
1879 (error, ENOTSUP,
1880 RTE_FLOW_ERROR_TYPE_ACTION, actions,
1881 "vlan modify is not supported,"
1882 " set action must follow push action");
1883 current_action_flag = MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
1884 break;
1885 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
1886 current_action_flag = MLX5_FLOW_ACTION_VXLAN_DECAP;
1887 break;
1888 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
1889 ret = flow_tcf_validate_vxlan_encap(actions, error);
1890 if (ret < 0)
1891 return ret;
1892 current_action_flag = MLX5_FLOW_ACTION_VXLAN_ENCAP;
1893 break;
1894 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
1895 current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_SRC;
1896 break;
1897 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
1898 current_action_flag = MLX5_FLOW_ACTION_SET_IPV4_DST;
1899 break;
1900 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
1901 current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_SRC;
1902 break;
1903 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
1904 current_action_flag = MLX5_FLOW_ACTION_SET_IPV6_DST;
1905 break;
1906 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
1907 current_action_flag = MLX5_FLOW_ACTION_SET_TP_SRC;
1908 break;
1909 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
1910 current_action_flag = MLX5_FLOW_ACTION_SET_TP_DST;
1911 break;
1912 case RTE_FLOW_ACTION_TYPE_SET_TTL:
1913 current_action_flag = MLX5_FLOW_ACTION_SET_TTL;
1914 break;
1915 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
1916 current_action_flag = MLX5_FLOW_ACTION_DEC_TTL;
1917 break;
1918 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
1919 current_action_flag = MLX5_FLOW_ACTION_SET_MAC_SRC;
1920 break;
1921 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
1922 current_action_flag = MLX5_FLOW_ACTION_SET_MAC_DST;
1923 break;
1924 default:
1925 return rte_flow_error_set(error, ENOTSUP,
1926 RTE_FLOW_ERROR_TYPE_ACTION,
1927 actions,
1928 "action not supported");
1929 }
1930 if (current_action_flag & MLX5_TCF_CONFIG_ACTIONS) {
1931 if (!actions->conf)
1932 return rte_flow_error_set
1933 (error, EINVAL,
1934 RTE_FLOW_ERROR_TYPE_ACTION_CONF,
1935 actions,
1936 "action configuration not set");
1937 }
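		/*
		 * Illustrative note: pedit_validated tracks whether a
		 * non-pedit action has already followed a run of "set"
		 * (pedit) actions; the checks below use it to require that
		 * all set actions are listed in one contiguous group.
		 */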
1938 if ((current_action_flag & MLX5_TCF_PEDIT_ACTIONS) &&
1939 pedit_validated)
1940 return rte_flow_error_set(error, ENOTSUP,
1941 RTE_FLOW_ERROR_TYPE_ACTION,
1942 actions,
1943 "set actions should be "
1944 "listed successively");
1945 if ((current_action_flag & ~MLX5_TCF_PEDIT_ACTIONS) &&
1946 (action_flags & MLX5_TCF_PEDIT_ACTIONS))
1947 pedit_validated = 1;
1948 if ((current_action_flag & MLX5_TCF_FATE_ACTIONS) &&
1949 (action_flags & MLX5_TCF_FATE_ACTIONS))
1950 return rte_flow_error_set(error, EINVAL,
1951 RTE_FLOW_ERROR_TYPE_ACTION,
1952 actions,
1953 "can't have multiple fate"
1954 " actions");
1955 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1956 (action_flags & MLX5_TCF_VXLAN_ACTIONS))
1957 return rte_flow_error_set(error, EINVAL,
1958 RTE_FLOW_ERROR_TYPE_ACTION,
1959 actions,
1960 "can't have multiple vxlan"
1961 " actions");
1962 if ((current_action_flag & MLX5_TCF_VXLAN_ACTIONS) &&
1963 (action_flags & MLX5_TCF_VLAN_ACTIONS))
1964 return rte_flow_error_set(error, ENOTSUP,
1965 RTE_FLOW_ERROR_TYPE_ACTION,
1966 actions,
1967 "can't have vxlan and vlan"
1968 " actions in the same rule");
1969 action_flags |= current_action_flag;
1970 }
1971 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
1972 unsigned int i;
1973
1974 switch (items->type) {
1975 case RTE_FLOW_ITEM_TYPE_VOID:
1976 break;
1977 case RTE_FLOW_ITEM_TYPE_PORT_ID:
1978 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
1979 return rte_flow_error_set
1980 (error, ENOTSUP,
1981 RTE_FLOW_ERROR_TYPE_ITEM, items,
1982 "inner tunnel port id"
1983 " item is not supported");
1984 mask.port_id = flow_tcf_item_mask
1985 (items, &rte_flow_item_port_id_mask,
1986 &flow_tcf_mask_supported.port_id,
1987 &flow_tcf_mask_empty.port_id,
1988 sizeof(flow_tcf_mask_supported.port_id),
1989 error);
1990 if (!mask.port_id)
1991 return -rte_errno;
1992 if (mask.port_id == &flow_tcf_mask_empty.port_id) {
1993 in_port_id_set = 1;
1994 break;
1995 }
1996 spec.port_id = items->spec;
1997 if (mask.port_id->id && mask.port_id->id != 0xffffffff)
1998 return rte_flow_error_set
1999 (error, ENOTSUP,
2000 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2001 mask.port_id,
2002 "no support for partial mask on"
2003 " \"id\" field");
2004 if (!mask.port_id->id)
2005 i = 0;
2006 else
2007 for (i = 0; ptoi[i].ifindex; ++i)
2008 if (ptoi[i].port_id == spec.port_id->id)
2009 break;
2010 if (!ptoi[i].ifindex)
2011 return rte_flow_error_set
2012 (error, ENODEV,
2013 RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
2014 spec.port_id,
2015 "missing data to convert port ID to"
2016 " ifindex");
2017 if (in_port_id_set && ptoi[i].ifindex != tcm_ifindex)
2018 return rte_flow_error_set
2019 (error, ENOTSUP,
2020 RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
2021 spec.port_id,
2022 "cannot match traffic for"
2023 " several port IDs through"
2024 " a single flow rule");
2025 tcm_ifindex = ptoi[i].ifindex;
2026 in_port_id_set = 1;
2027 break;
2028 case RTE_FLOW_ITEM_TYPE_ETH:
2029 ret = mlx5_flow_validate_item_eth(items, item_flags,
2030 error);
2031 if (ret < 0)
2032 return ret;
2033 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2034 MLX5_FLOW_LAYER_INNER_L2 :
2035 MLX5_FLOW_LAYER_OUTER_L2;
2036 /* TODO:
2037 * Redundant check due to different supported mask.
2038 * Same for the rest of items.
2039 */
2040 mask.eth = flow_tcf_item_mask
2041 (items, &rte_flow_item_eth_mask,
2042 &flow_tcf_mask_supported.eth,
2043 &flow_tcf_mask_empty.eth,
2044 sizeof(flow_tcf_mask_supported.eth),
2045 error);
2046 if (!mask.eth)
2047 return -rte_errno;
2048 if (mask.eth->type && mask.eth->type !=
2049 RTE_BE16(0xffff))
2050 return rte_flow_error_set
2051 (error, ENOTSUP,
2052 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2053 mask.eth,
2054 "no support for partial mask on"
2055 " \"type\" field");
2056 assert(items->spec);
2057 spec.eth = items->spec;
2058 if (mask.eth->type &&
2059 (item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
2060 inner_etype != RTE_BE16(ETH_P_ALL) &&
2061 inner_etype != spec.eth->type)
2062 return rte_flow_error_set
2063 (error, EINVAL,
2064 RTE_FLOW_ERROR_TYPE_ITEM,
2065 items,
2066 "inner eth_type conflict");
2067 if (mask.eth->type &&
2068 !(item_flags & MLX5_FLOW_LAYER_TUNNEL) &&
2069 outer_etype != RTE_BE16(ETH_P_ALL) &&
2070 outer_etype != spec.eth->type)
2071 return rte_flow_error_set
2072 (error, EINVAL,
2073 RTE_FLOW_ERROR_TYPE_ITEM,
2074 items,
2075 "outer eth_type conflict");
2076 if (mask.eth->type) {
2077 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
2078 inner_etype = spec.eth->type;
2079 else
2080 outer_etype = spec.eth->type;
2081 }
2082 break;
2083 case RTE_FLOW_ITEM_TYPE_VLAN:
2084 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
2085 return rte_flow_error_set
2086 (error, ENOTSUP,
2087 RTE_FLOW_ERROR_TYPE_ITEM, items,
2088 "inner tunnel VLAN"
2089 " is not supported");
2090 ret = mlx5_flow_validate_item_vlan(items, item_flags,
2091 error);
2092 if (ret < 0)
2093 return ret;
2094 item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
2095 mask.vlan = flow_tcf_item_mask
2096 (items, &rte_flow_item_vlan_mask,
2097 &flow_tcf_mask_supported.vlan,
2098 &flow_tcf_mask_empty.vlan,
2099 sizeof(flow_tcf_mask_supported.vlan),
2100 error);
2101 if (!mask.vlan)
2102 return -rte_errno;
2103 if ((mask.vlan->tci & RTE_BE16(0xe000) &&
2104 (mask.vlan->tci & RTE_BE16(0xe000)) !=
2105 RTE_BE16(0xe000)) ||
2106 (mask.vlan->tci & RTE_BE16(0x0fff) &&
2107 (mask.vlan->tci & RTE_BE16(0x0fff)) !=
2108 RTE_BE16(0x0fff)) ||
2109 (mask.vlan->inner_type &&
2110 mask.vlan->inner_type != RTE_BE16(0xffff)))
2111 return rte_flow_error_set
2112 (error, ENOTSUP,
2113 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2114 mask.vlan,
2115 "no support for partial masks on"
2116 " \"tci\" (PCP and VID parts) and"
2117 " \"inner_type\" fields");
2118 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2119 outer_etype != RTE_BE16(ETH_P_8021Q))
2120 return rte_flow_error_set
2121 (error, EINVAL,
2122 RTE_FLOW_ERROR_TYPE_ITEM,
2123 items,
2124 "outer eth_type conflict,"
2125 " must be 802.1Q");
2126 outer_etype = RTE_BE16(ETH_P_8021Q);
2127 assert(items->spec);
2128 spec.vlan = items->spec;
2129 if (mask.vlan->inner_type &&
2130 vlan_etype != RTE_BE16(ETH_P_ALL) &&
2131 vlan_etype != spec.vlan->inner_type)
2132 return rte_flow_error_set
2133 (error, EINVAL,
2134 RTE_FLOW_ERROR_TYPE_ITEM,
2135 items,
2136 "vlan eth_type conflict");
2137 if (mask.vlan->inner_type)
2138 vlan_etype = spec.vlan->inner_type;
2139 break;
2140 case RTE_FLOW_ITEM_TYPE_IPV4:
2141 ret = mlx5_flow_validate_item_ipv4
2142 (items, item_flags,
2143 &flow_tcf_mask_supported.ipv4, error);
2144 if (ret < 0)
2145 return ret;
2146 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2147 MLX5_FLOW_LAYER_INNER_L3_IPV4 :
2148 MLX5_FLOW_LAYER_OUTER_L3_IPV4;
2149 mask.ipv4 = flow_tcf_item_mask
2150 (items, &rte_flow_item_ipv4_mask,
2151 &flow_tcf_mask_supported.ipv4,
2152 &flow_tcf_mask_empty.ipv4,
2153 sizeof(flow_tcf_mask_supported.ipv4),
2154 error);
2155 if (!mask.ipv4)
2156 return -rte_errno;
2157 if (mask.ipv4->hdr.next_proto_id &&
2158 mask.ipv4->hdr.next_proto_id != 0xff)
2159 return rte_flow_error_set
2160 (error, ENOTSUP,
2161 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2162 mask.ipv4,
2163 "no support for partial mask on"
2164 " \"hdr.next_proto_id\" field");
2165 else if (mask.ipv4->hdr.next_proto_id)
2166 next_protocol =
2167 ((const struct rte_flow_item_ipv4 *)
2168 (items->spec))->hdr.next_proto_id;
2169 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2170 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2171 inner_etype != RTE_BE16(ETH_P_IP))
2172 return rte_flow_error_set
2173 (error, EINVAL,
2174 RTE_FLOW_ERROR_TYPE_ITEM,
2175 items,
2176 "inner eth_type conflict,"
2177 " IPv4 is required");
2178 inner_etype = RTE_BE16(ETH_P_IP);
2179 } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2180 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2181 vlan_etype != RTE_BE16(ETH_P_IP))
2182 return rte_flow_error_set
2183 (error, EINVAL,
2184 RTE_FLOW_ERROR_TYPE_ITEM,
2185 items,
2186 "vlan eth_type conflict,"
2187 " IPv4 is required");
2188 vlan_etype = RTE_BE16(ETH_P_IP);
2189 } else {
2190 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2191 outer_etype != RTE_BE16(ETH_P_IP))
2192 return rte_flow_error_set
2193 (error, EINVAL,
2194 RTE_FLOW_ERROR_TYPE_ITEM,
2195 items,
2196 "eth_type conflict,"
2197 " IPv4 is required");
2198 outer_etype = RTE_BE16(ETH_P_IP);
2199 }
2200 break;
2201 case RTE_FLOW_ITEM_TYPE_IPV6:
2202 ret = mlx5_flow_validate_item_ipv6
2203 (items, item_flags,
2204 &flow_tcf_mask_supported.ipv6, error);
2205 if (ret < 0)
2206 return ret;
2207 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2208 MLX5_FLOW_LAYER_INNER_L3_IPV6 :
2209 MLX5_FLOW_LAYER_OUTER_L3_IPV6;
2210 mask.ipv6 = flow_tcf_item_mask
2211 (items, &rte_flow_item_ipv6_mask,
2212 &flow_tcf_mask_supported.ipv6,
2213 &flow_tcf_mask_empty.ipv6,
2214 sizeof(flow_tcf_mask_supported.ipv6),
2215 error);
2216 if (!mask.ipv6)
2217 return -rte_errno;
2218 if (mask.ipv6->hdr.proto &&
2219 mask.ipv6->hdr.proto != 0xff)
2220 return rte_flow_error_set
2221 (error, ENOTSUP,
2222 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2223 mask.ipv6,
2224 "no support for partial mask on"
2225 " \"hdr.proto\" field");
2226 else if (mask.ipv6->hdr.proto)
2227 next_protocol =
2228 ((const struct rte_flow_item_ipv6 *)
2229 (items->spec))->hdr.proto;
2230 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
2231 if (inner_etype != RTE_BE16(ETH_P_ALL) &&
2232 inner_etype != RTE_BE16(ETH_P_IPV6))
2233 return rte_flow_error_set
2234 (error, EINVAL,
2235 RTE_FLOW_ERROR_TYPE_ITEM,
2236 items,
2237 "inner eth_type conflict,"
2238 " IPv6 is required");
2239 inner_etype = RTE_BE16(ETH_P_IPV6);
2240 } else if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN) {
2241 if (vlan_etype != RTE_BE16(ETH_P_ALL) &&
2242 vlan_etype != RTE_BE16(ETH_P_IPV6))
2243 return rte_flow_error_set
2244 (error, EINVAL,
2245 RTE_FLOW_ERROR_TYPE_ITEM,
2246 items,
2247 "vlan eth_type conflict,"
2248 " IPv6 is required");
2249 vlan_etype = RTE_BE16(ETH_P_IPV6);
2250 } else {
2251 if (outer_etype != RTE_BE16(ETH_P_ALL) &&
2252 outer_etype != RTE_BE16(ETH_P_IPV6))
2253 return rte_flow_error_set
2254 (error, EINVAL,
2255 RTE_FLOW_ERROR_TYPE_ITEM,
2256 items,
2257 "eth_type conflict,"
2258 " IPv6 is required");
2259 outer_etype = RTE_BE16(ETH_P_IPV6);
2260 }
2261 break;
2262 case RTE_FLOW_ITEM_TYPE_UDP:
2263 ret = mlx5_flow_validate_item_udp(items, item_flags,
2264 next_protocol, error);
2265 if (ret < 0)
2266 return ret;
2267 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2268 MLX5_FLOW_LAYER_INNER_L4_UDP :
2269 MLX5_FLOW_LAYER_OUTER_L4_UDP;
2270 mask.udp = flow_tcf_item_mask
2271 (items, &rte_flow_item_udp_mask,
2272 &flow_tcf_mask_supported.udp,
2273 &flow_tcf_mask_empty.udp,
2274 sizeof(flow_tcf_mask_supported.udp),
2275 error);
2276 if (!mask.udp)
2277 return -rte_errno;
2278 /*
2279 			 * Save the presumed outer UDP item for an extra check
2280 			 * in case a tunnel item is found later in the list.
2281 */
2282 if (!(item_flags & MLX5_FLOW_LAYER_TUNNEL))
2283 outer_udp = items;
2284 break;
2285 case RTE_FLOW_ITEM_TYPE_TCP:
2286 ret = mlx5_flow_validate_item_tcp
2287 (items, item_flags,
2288 next_protocol,
2289 &flow_tcf_mask_supported.tcp,
2290 error);
2291 if (ret < 0)
2292 return ret;
2293 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
2294 MLX5_FLOW_LAYER_INNER_L4_TCP :
2295 MLX5_FLOW_LAYER_OUTER_L4_TCP;
2296 mask.tcp = flow_tcf_item_mask
2297 (items, &rte_flow_item_tcp_mask,
2298 &flow_tcf_mask_supported.tcp,
2299 &flow_tcf_mask_empty.tcp,
2300 sizeof(flow_tcf_mask_supported.tcp),
2301 error);
2302 if (!mask.tcp)
2303 return -rte_errno;
2304 break;
2305 case RTE_FLOW_ITEM_TYPE_VXLAN:
2306 if (item_flags & MLX5_FLOW_LAYER_OUTER_VLAN)
2307 return rte_flow_error_set
2308 (error, ENOTSUP,
2309 RTE_FLOW_ERROR_TYPE_ITEM, items,
2310 "vxlan tunnel over vlan"
2311 " is not supported");
2312 ret = mlx5_flow_validate_item_vxlan(items,
2313 item_flags, error);
2314 if (ret < 0)
2315 return ret;
2316 item_flags |= MLX5_FLOW_LAYER_VXLAN;
2317 mask.vxlan = flow_tcf_item_mask
2318 (items, &rte_flow_item_vxlan_mask,
2319 &flow_tcf_mask_supported.vxlan,
2320 &flow_tcf_mask_empty.vxlan,
2321 sizeof(flow_tcf_mask_supported.vxlan), error);
2322 if (!mask.vxlan)
2323 return -rte_errno;
2324 if (mask.vxlan->vni[0] != 0xff ||
2325 mask.vxlan->vni[1] != 0xff ||
2326 mask.vxlan->vni[2] != 0xff)
2327 return rte_flow_error_set
2328 (error, ENOTSUP,
2329 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
2330 mask.vxlan,
2331 "no support for partial or "
2332 "empty mask on \"vxlan.vni\" field");
2333 /*
2334 			 * The VNI item assumes a VXLAN tunnel; it requires that
2335 			 * at least the outer destination UDP port be specified
2336 			 * without wildcards to allow the kernel to select the
2337 			 * virtual VXLAN device by port. An outer IPv4 or IPv6
2338 			 * item must also be specified (wildcards or even a zero
2339 			 * mask are allowed) to let the driver know the tunnel
2340 			 * IP version and process UDP traffic correctly.
2341 */
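			/*
			 * For illustration (hypothetical testpmd-style
			 * syntax): a pattern that satisfies these
			 * constraints could look like
			 *   eth / ipv4 / udp dst is 4789 /
			 *   vxlan vni is 42 / end
			 */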
2342 if (!(item_flags &
2343 (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2344 MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2345 return rte_flow_error_set
2346 (error, EINVAL,
2347 RTE_FLOW_ERROR_TYPE_ACTION,
2348 NULL,
2349 "no outer IP pattern found"
2350 " for vxlan tunnel");
2351 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_UDP))
2352 return rte_flow_error_set
2353 (error, EINVAL,
2354 RTE_FLOW_ERROR_TYPE_ACTION,
2355 NULL,
2356 "no outer UDP pattern found"
2357 " for vxlan tunnel");
2358 /*
2359 * All items preceding the tunnel item become outer
2360 * ones and we should do extra validation for them
2361 * due to tc limitations for tunnel outer parameters.
2362 			 * Currently only the outer UDP item requires an extra check,
2363 			 * so use the saved pointer instead of rescanning the item list.
2364 */
2365 assert(outer_udp);
2366 ret = flow_tcf_validate_vxlan_decap_udp
2367 (outer_udp, error);
2368 if (ret < 0)
2369 return ret;
2370 /* Reset L4 protocol for inner parameters. */
2371 next_protocol = 0xff;
2372 break;
2373 default:
2374 return rte_flow_error_set(error, ENOTSUP,
2375 RTE_FLOW_ERROR_TYPE_ITEM,
2376 items, "item not supported");
2377 }
2378 }
2379 if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2380 (action_flags & MLX5_FLOW_ACTION_DROP))
2381 return rte_flow_error_set(error, ENOTSUP,
2382 RTE_FLOW_ERROR_TYPE_ACTION,
2383 actions,
2384 "set action is not compatible with "
2385 "drop action");
2386 if ((action_flags & MLX5_TCF_PEDIT_ACTIONS) &&
2387 !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2388 return rte_flow_error_set(error, ENOTSUP,
2389 RTE_FLOW_ERROR_TYPE_ACTION,
2390 actions,
2391 "set action must be followed by "
2392 "port_id action");
2393 if (action_flags &
2394 (MLX5_FLOW_ACTION_SET_IPV4_SRC | MLX5_FLOW_ACTION_SET_IPV4_DST)) {
2395 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV4))
2396 return rte_flow_error_set(error, EINVAL,
2397 RTE_FLOW_ERROR_TYPE_ACTION,
2398 actions,
2399 "no ipv4 item found in"
2400 " pattern");
2401 }
2402 if (action_flags &
2403 (MLX5_FLOW_ACTION_SET_IPV6_SRC | MLX5_FLOW_ACTION_SET_IPV6_DST)) {
2404 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L3_IPV6))
2405 return rte_flow_error_set(error, EINVAL,
2406 RTE_FLOW_ERROR_TYPE_ACTION,
2407 actions,
2408 "no ipv6 item found in"
2409 " pattern");
2410 }
2411 if (action_flags &
2412 (MLX5_FLOW_ACTION_SET_TP_SRC | MLX5_FLOW_ACTION_SET_TP_DST)) {
2413 if (!(item_flags &
2414 (MLX5_FLOW_LAYER_OUTER_L4_UDP |
2415 MLX5_FLOW_LAYER_OUTER_L4_TCP)))
2416 return rte_flow_error_set(error, EINVAL,
2417 RTE_FLOW_ERROR_TYPE_ACTION,
2418 actions,
2419 "no TCP/UDP item found in"
2420 " pattern");
2421 }
2422 /*
2423 * FW syndrome (0xA9C090):
2424 * set_flow_table_entry: push vlan action fte in fdb can ONLY be
2425 * forward to the uplink.
2426 */
2427 if ((action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) &&
2428 (action_flags & MLX5_FLOW_ACTION_PORT_ID) &&
2429 ((struct mlx5_priv *)port_id_dev->data->dev_private)->representor)
2430 return rte_flow_error_set(error, ENOTSUP,
2431 RTE_FLOW_ERROR_TYPE_ACTION, actions,
2432 "vlan push can only be applied"
2433 " when forwarding to uplink port");
2434 /*
2435 * FW syndrome (0x294609):
2436 * set_flow_table_entry: modify/pop/push actions in fdb flow table
2437 * are supported only while forwarding to vport.
2438 */
2439 if ((action_flags & MLX5_TCF_VLAN_ACTIONS) &&
2440 !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2441 return rte_flow_error_set(error, ENOTSUP,
2442 RTE_FLOW_ERROR_TYPE_ACTION, actions,
2443 "vlan actions are supported"
2444 " only with port_id action");
2445 if ((action_flags & MLX5_TCF_VXLAN_ACTIONS) &&
2446 !(action_flags & MLX5_FLOW_ACTION_PORT_ID))
2447 return rte_flow_error_set(error, ENOTSUP,
2448 RTE_FLOW_ERROR_TYPE_ACTION, NULL,
2449 "vxlan actions are supported"
2450 " only with port_id action");
2451 if (!(action_flags & MLX5_TCF_FATE_ACTIONS))
2452 return rte_flow_error_set(error, EINVAL,
2453 RTE_FLOW_ERROR_TYPE_ACTION, actions,
2454 "no fate action is found");
2455 if (action_flags &
2456 (MLX5_FLOW_ACTION_SET_TTL | MLX5_FLOW_ACTION_DEC_TTL)) {
2457 if (!(item_flags &
2458 (MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
2459 MLX5_FLOW_LAYER_OUTER_L3_IPV6)))
2460 return rte_flow_error_set(error, EINVAL,
2461 RTE_FLOW_ERROR_TYPE_ACTION,
2462 actions,
2463 "no IP found in pattern");
2464 }
2465 if (action_flags &
2466 (MLX5_FLOW_ACTION_SET_MAC_SRC | MLX5_FLOW_ACTION_SET_MAC_DST)) {
2467 if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L2))
2468 return rte_flow_error_set(error, ENOTSUP,
2469 RTE_FLOW_ERROR_TYPE_ACTION,
2470 actions,
2471 "no ethernet found in"
2472 " pattern");
2473 }
2474 if ((action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) &&
2475 !(item_flags & MLX5_FLOW_LAYER_VXLAN))
2476 return rte_flow_error_set(error, EINVAL,
2477 RTE_FLOW_ERROR_TYPE_ACTION,
2478 NULL,
2479 "no VNI pattern found"
2480 " for vxlan decap action");
2481 if ((action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) &&
2482 (item_flags & MLX5_FLOW_LAYER_TUNNEL))
2483 return rte_flow_error_set(error, EINVAL,
2484 RTE_FLOW_ERROR_TYPE_ACTION,
2485 NULL,
2486 "vxlan encap not supported"
2487 " for tunneled traffic");
2488 return 0;
2489 }
2490
2491 /**
2492 * Calculate maximum size of memory for flow items of Linux TC flower.
2493 *
2494 * @param[in] attr
2495 * Pointer to the flow attributes.
2496 * @param[in] items
2497 * Pointer to the list of items.
2498 * @param[out] action_flags
2499 * Pointer to the detected actions.
2500 *
2501 * @return
2502 * Maximum size of memory for items.
2503 */
2504 static int
2505 flow_tcf_get_items_size(const struct rte_flow_attr *attr,
2506 const struct rte_flow_item items[],
2507 uint64_t *action_flags)
2508 {
2509 int size = 0;
2510
2511 size += SZ_NLATTR_STRZ_OF("flower") +
2512 SZ_NLATTR_TYPE_OF(uint16_t) + /* Outer ether type. */
2513 SZ_NLATTR_NEST + /* TCA_OPTIONS. */
2514 SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */
2515 if (attr->group > 0)
2516 size += SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CHAIN. */
2517 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2518 switch (items->type) {
2519 case RTE_FLOW_ITEM_TYPE_VOID:
2520 break;
2521 case RTE_FLOW_ITEM_TYPE_PORT_ID:
2522 break;
2523 case RTE_FLOW_ITEM_TYPE_ETH:
2524 size += SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4;
2525 /* dst/src MAC addr and mask. */
2526 break;
2527 case RTE_FLOW_ITEM_TYPE_VLAN:
2528 size += SZ_NLATTR_TYPE_OF(uint16_t) +
2529 /* VLAN Ether type. */
2530 SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */
2531 SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */
2532 break;
2533 case RTE_FLOW_ITEM_TYPE_IPV4: {
2534 const struct rte_flow_item_ipv4 *ipv4 = items->mask;
2535
2536 size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2537 SZ_NLATTR_TYPE_OF(uint32_t) * 4;
2538 /* dst/src IP addr and mask. */
2539 if (ipv4 && ipv4->hdr.time_to_live)
2540 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2541 if (ipv4 && ipv4->hdr.type_of_service)
2542 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2543 break;
2544 }
2545 case RTE_FLOW_ITEM_TYPE_IPV6: {
2546 const struct rte_flow_item_ipv6 *ipv6 = items->mask;
2547
2548 size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2549 SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4;
2550 /* dst/src IP addr and mask. */
2551 if (ipv6 && ipv6->hdr.hop_limits)
2552 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2553 if (ipv6 && (rte_be_to_cpu_32(ipv6->hdr.vtc_flow) &
2554 (0xfful << IPV6_HDR_TC_SHIFT)))
2555 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2556 break;
2557 }
2558 case RTE_FLOW_ITEM_TYPE_UDP:
2559 size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2560 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2561 /* dst/src port and mask. */
2562 break;
2563 case RTE_FLOW_ITEM_TYPE_TCP:
2564 size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
2565 SZ_NLATTR_TYPE_OF(uint16_t) * 4;
2566 /* dst/src port and mask. */
2567 break;
2568 case RTE_FLOW_ITEM_TYPE_VXLAN:
2569 size += SZ_NLATTR_TYPE_OF(uint32_t);
2570 /*
2571 			 * There might be no VXLAN decap action in the action
2572 			 * list; nonetheless, the VXLAN tunnel flow requires
2573 			 * the decap structure to be correctly applied to the
2574 			 * VXLAN device, so set the flag to create the structure.
2575 			 * The translation routine will not put the decap action
2576 			 * in the Netlink message if there is no actual action
2577 			 * in the list.
2578 */
2579 *action_flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2580 break;
2581 default:
2582 DRV_LOG(WARNING,
2583 "unsupported item %p type %d,"
2584 " items must be validated before flow creation",
2585 (const void *)items, items->type);
2586 break;
2587 }
2588 }
2589 return size;
2590 }
2591
2592 /**
2593  * Calculate the size of memory needed to store the VXLAN encapsulation
2594  * related items in the Netlink message buffer. The item list is
2595  * specified by the RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action.
2596  * The item list should be validated beforehand.
2597 *
2598 * @param[in] action
2599 * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
2600 * List of pattern items to scan data from.
2601 *
2602 * @return
2603  *   The size of the part of the Netlink message buffer needed to store
2604  *   the VXLAN encapsulation item attributes.
2605 */
2606 static int
2607 flow_tcf_vxlan_encap_size(const struct rte_flow_action *action)
2608 {
2609 const struct rte_flow_item *items;
2610 int size = 0;
2611
2612 assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
2613 assert(action->conf);
2614
2615 items = ((const struct rte_flow_action_vxlan_encap *)
2616 action->conf)->definition;
2617 assert(items);
2618 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
2619 switch (items->type) {
2620 case RTE_FLOW_ITEM_TYPE_VOID:
2621 break;
2622 case RTE_FLOW_ITEM_TYPE_ETH:
2623 /* This item does not require message buffer. */
2624 break;
2625 case RTE_FLOW_ITEM_TYPE_IPV4: {
2626 const struct rte_flow_item_ipv4 *ipv4 = items->mask;
2627
2628 size += SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN) * 2;
2629 if (ipv4 && ipv4->hdr.time_to_live)
2630 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2631 if (ipv4 && ipv4->hdr.type_of_service)
2632 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2633 break;
2634 }
2635 case RTE_FLOW_ITEM_TYPE_IPV6: {
2636 const struct rte_flow_item_ipv6 *ipv6 = items->mask;
2637
2638 size += SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2;
2639 if (ipv6 && ipv6->hdr.hop_limits)
2640 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2641 if (ipv6 && (rte_be_to_cpu_32(ipv6->hdr.vtc_flow) &
2642 (0xfful << IPV6_HDR_TC_SHIFT)))
2643 size += SZ_NLATTR_TYPE_OF(uint8_t) * 2;
2644 break;
2645 }
2646 case RTE_FLOW_ITEM_TYPE_UDP: {
2647 const struct rte_flow_item_udp *udp = items->mask;
2648
2649 size += SZ_NLATTR_TYPE_OF(uint16_t);
2650 if (!udp || udp->hdr.src_port != RTE_BE16(0x0000))
2651 size += SZ_NLATTR_TYPE_OF(uint16_t);
2652 break;
2653 }
2654 case RTE_FLOW_ITEM_TYPE_VXLAN:
2655 size += SZ_NLATTR_TYPE_OF(uint32_t);
2656 break;
2657 default:
2658 assert(false);
2659 DRV_LOG(WARNING,
2660 "unsupported item %p type %d,"
2661 " items must be validated"
2662 " before flow creation",
2663 (const void *)items, items->type);
2664 return 0;
2665 }
2666 }
2667 return size;
2668 }
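/*
 * Worked example (illustration only, derived from the cases above): for an
 * encap definition of eth / ipv4 / udp / vxlan with no TTL/TOS bits in the
 * IPv4 mask and a zero source-port mask, the accumulated size is
 *   SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN) * 2   (IPv4 src/dst addresses)
 *   + SZ_NLATTR_TYPE_OF(uint16_t)          (UDP destination port)
 *   + SZ_NLATTR_TYPE_OF(uint32_t)          (VXLAN VNI)
 * while the ETH item contributes nothing to the buffer size.
 */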
2669
2670 /**
2671 * Calculate maximum size of memory for flow actions of Linux TC flower and
2672 * extract specified actions.
2673 *
2674 * @param[in] actions
2675 * Pointer to the list of actions.
2676 * @param[out] action_flags
2677 * Pointer to the detected actions.
2678 *
2679 * @return
2680 * Maximum size of memory for actions.
2681 */
2682 static int
2683 flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
2684 uint64_t *action_flags)
2685 {
2686 int size = 0;
2687 uint64_t flags = *action_flags;
2688
2689 size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */
2690 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
2691 switch (actions->type) {
2692 case RTE_FLOW_ACTION_TYPE_VOID:
2693 break;
2694 case RTE_FLOW_ACTION_TYPE_PORT_ID:
2695 size += SZ_NLATTR_NEST + /* na_act_index. */
2696 SZ_NLATTR_STRZ_OF("mirred") +
2697 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2698 SZ_NLATTR_TYPE_OF(struct tc_mirred);
2699 flags |= MLX5_FLOW_ACTION_PORT_ID;
2700 break;
2701 case RTE_FLOW_ACTION_TYPE_JUMP:
2702 size += SZ_NLATTR_NEST + /* na_act_index. */
2703 SZ_NLATTR_STRZ_OF("gact") +
2704 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2705 SZ_NLATTR_TYPE_OF(struct tc_gact);
2706 flags |= MLX5_FLOW_ACTION_JUMP;
2707 break;
2708 case RTE_FLOW_ACTION_TYPE_DROP:
2709 size += SZ_NLATTR_NEST + /* na_act_index. */
2710 SZ_NLATTR_STRZ_OF("gact") +
2711 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2712 SZ_NLATTR_TYPE_OF(struct tc_gact);
2713 flags |= MLX5_FLOW_ACTION_DROP;
2714 break;
2715 case RTE_FLOW_ACTION_TYPE_COUNT:
2716 break;
2717 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
2718 flags |= MLX5_FLOW_ACTION_OF_POP_VLAN;
2719 goto action_of_vlan;
2720 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
2721 flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
2722 goto action_of_vlan;
2723 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
2724 flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID;
2725 goto action_of_vlan;
2726 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
2727 flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_PCP;
2728 goto action_of_vlan;
2729 action_of_vlan:
2730 size += SZ_NLATTR_NEST + /* na_act_index. */
2731 SZ_NLATTR_STRZ_OF("vlan") +
2732 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2733 SZ_NLATTR_TYPE_OF(struct tc_vlan) +
2734 SZ_NLATTR_TYPE_OF(uint16_t) +
2735 /* VLAN protocol. */
2736 SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
2737 SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
2738 break;
2739 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
2740 size += SZ_NLATTR_NEST + /* na_act_index. */
2741 SZ_NLATTR_STRZ_OF("tunnel_key") +
2742 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2743 SZ_NLATTR_TYPE_OF(uint8_t);
2744 size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2745 size += flow_tcf_vxlan_encap_size(actions) +
2746 RTE_ALIGN_CEIL /* preceding encap params. */
2747 (sizeof(struct flow_tcf_vxlan_encap),
2748 MNL_ALIGNTO);
2749 flags |= MLX5_FLOW_ACTION_VXLAN_ENCAP;
2750 break;
2751 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
2752 size += SZ_NLATTR_NEST + /* na_act_index. */
2753 SZ_NLATTR_STRZ_OF("tunnel_key") +
2754 SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
2755 SZ_NLATTR_TYPE_OF(uint8_t);
2756 size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
2757 size += RTE_ALIGN_CEIL /* preceding decap params. */
2758 (sizeof(struct flow_tcf_vxlan_decap),
2759 MNL_ALIGNTO);
2760 flags |= MLX5_FLOW_ACTION_VXLAN_DECAP;
2761 break;
2762 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
2763 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
2764 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
2765 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
2766 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
2767 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
2768 case RTE_FLOW_ACTION_TYPE_SET_TTL:
2769 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
2770 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
2771 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
2772 size += flow_tcf_get_pedit_actions_size(&actions,
2773 &flags);
2774 break;
2775 default:
2776 DRV_LOG(WARNING,
2777 "unsupported action %p type %d,"
2778 				" actions must be validated before flow creation",
2779 (const void *)actions, actions->type);
2780 break;
2781 }
2782 }
2783 *action_flags = flags;
2784 return size;
2785 }
2786
2787 /**
2788 * Prepare a flow object for Linux TC flower. It calculates the maximum size of
2789 * memory required, allocates the memory, initializes Netlink message headers
2790  * and sets a unique TC message handle.
2791 *
2792 * @param[in] attr
2793 * Pointer to the flow attributes.
2794 * @param[in] items
2795 * Pointer to the list of items.
2796 * @param[in] actions
2797 * Pointer to the list of actions.
2798 * @param[out] error
2799 * Pointer to the error structure.
2800 *
2801 * @return
2802 * Pointer to mlx5_flow object on success,
2803 * otherwise NULL and rte_errno is set.
2804 */
2805 static struct mlx5_flow *
2806 flow_tcf_prepare(const struct rte_flow_attr *attr,
2807 const struct rte_flow_item items[],
2808 const struct rte_flow_action actions[],
2809 struct rte_flow_error *error)
2810 {
2811 size_t size = RTE_ALIGN_CEIL
2812 (sizeof(struct mlx5_flow),
2813 alignof(struct flow_tcf_tunnel_hdr)) +
2814 MNL_ALIGN(sizeof(struct nlmsghdr)) +
2815 MNL_ALIGN(sizeof(struct tcmsg));
2816 struct mlx5_flow *dev_flow;
2817 uint64_t action_flags = 0;
2818 struct nlmsghdr *nlh;
2819 struct tcmsg *tcm;
2820 uint8_t *sp, *tun = NULL;
2821
2822 size += flow_tcf_get_items_size(attr, items, &action_flags);
2823 size += flow_tcf_get_actions_and_size(actions, &action_flags);
2824 dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
2825 if (!dev_flow) {
2826 rte_flow_error_set(error, ENOMEM,
2827 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
2828 "not enough memory to create E-Switch flow");
2829 return NULL;
2830 }
2831 sp = (uint8_t *)(dev_flow + 1);
2832 if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP) {
2833 sp = RTE_PTR_ALIGN
2834 (sp, alignof(struct flow_tcf_tunnel_hdr));
2835 tun = sp;
2836 sp += RTE_ALIGN_CEIL
2837 (sizeof(struct flow_tcf_vxlan_encap),
2838 MNL_ALIGNTO);
2839 #ifndef NDEBUG
2840 size -= RTE_ALIGN_CEIL
2841 (sizeof(struct flow_tcf_vxlan_encap),
2842 MNL_ALIGNTO);
2843 #endif
2844 } else if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP) {
2845 sp = RTE_PTR_ALIGN
2846 (sp, alignof(struct flow_tcf_tunnel_hdr));
2847 tun = sp;
2848 sp += RTE_ALIGN_CEIL
2849 (sizeof(struct flow_tcf_vxlan_decap),
2850 MNL_ALIGNTO);
2851 #ifndef NDEBUG
2852 size -= RTE_ALIGN_CEIL
2853 (sizeof(struct flow_tcf_vxlan_decap),
2854 MNL_ALIGNTO);
2855 #endif
2856 } else {
2857 sp = RTE_PTR_ALIGN(sp, MNL_ALIGNTO);
2858 }
2859 nlh = mnl_nlmsg_put_header(sp);
2860 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
2861 *dev_flow = (struct mlx5_flow){
2862 .tcf = (struct mlx5_flow_tcf){
2863 #ifndef NDEBUG
2864 .nlsize = size - RTE_ALIGN_CEIL
2865 (sizeof(struct mlx5_flow),
2866 alignof(struct flow_tcf_tunnel_hdr)),
2867 #endif
2868 .tunnel = (struct flow_tcf_tunnel_hdr *)tun,
2869 .nlh = nlh,
2870 .tcm = tcm,
2871 },
2872 };
2873 if (action_flags & MLX5_FLOW_ACTION_VXLAN_DECAP)
2874 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_DECAP;
2875 else if (action_flags & MLX5_FLOW_ACTION_VXLAN_ENCAP)
2876 dev_flow->tcf.tunnel->type = FLOW_TCF_TUNACT_VXLAN_ENCAP;
2877 return dev_flow;
2878 }
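/*
 * Sketch of the buffer layout produced by flow_tcf_prepare() above
 * (illustrative only, derived from the code, not normative):
 *
 *   +----------------------------+  <- dev_flow
 *   | struct mlx5_flow           |
 *   +----------------------------+  <- tun (VXLAN encap/decap only)
 *   | flow_tcf_vxlan_encap/decap |
 *   +----------------------------+  <- nlh
 *   | struct nlmsghdr            |
 *   +----------------------------+  <- tcm
 *   | struct tcmsg               |
 *   +----------------------------+
 *   | Netlink attributes         |  filled later by flow_tcf_translate()
 *   +----------------------------+
 */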
2879
2880 /**
2881 * Make adjustments for supporting count actions.
2882 *
2883 * @param[in] dev
2884 * Pointer to the Ethernet device structure.
2885 * @param[in] dev_flow
2886 * Pointer to mlx5_flow.
2887 * @param[out] error
2888 * Pointer to error structure.
2889 *
2890 * @return
2891  *   0 on success, a negative errno value otherwise and rte_errno is set.
2892 */
2893 static int
2894 flow_tcf_translate_action_count(struct rte_eth_dev *dev __rte_unused,
2895 struct mlx5_flow *dev_flow,
2896 struct rte_flow_error *error)
2897 {
2898 struct rte_flow *flow = dev_flow->flow;
2899
2900 if (!flow->counter) {
2901 flow->counter = flow_tcf_counter_new();
2902 if (!flow->counter)
2903 return rte_flow_error_set(error, rte_errno,
2904 RTE_FLOW_ERROR_TYPE_ACTION,
2905 NULL,
2906 "cannot get counter"
2907 " context.");
2908 }
2909 return 0;
2910 }
2911
2912 /**
2913 * Convert VXLAN VNI to 32-bit integer.
2914 *
2915 * @param[in] vni
2916 * VXLAN VNI in 24-bit wire format.
2917 *
2918 * @return
2919 * VXLAN VNI as a 32-bit integer value in network endianness.
2920 */
2921 static inline rte_be32_t
2922 vxlan_vni_as_be32(const uint8_t vni[3])
2923 {
2924 union {
2925 uint8_t vni[4];
2926 rte_be32_t dword;
2927 } ret = {
2928 .vni = { 0, vni[0], vni[1], vni[2] },
2929 };
2930 return ret.dword;
2931 }
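/*
 * Worked example: for vni = {0x12, 0x34, 0x56} the union holds the bytes
 * 0x00 0x12 0x34 0x56 in memory, so the returned value compares equal to
 * rte_cpu_to_be_32(0x00123456) regardless of the host byte order.
 */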
2932
2933 /**
2934 * Helper function to process RTE_FLOW_ITEM_TYPE_ETH entry in configuration
2935 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the MAC address fields
2936 * in the encapsulation parameters structure. The item must be prevalidated,
2937  * no validation checks are performed by this function.
2938 *
2939 * @param[in] spec
2940 * RTE_FLOW_ITEM_TYPE_ETH entry specification.
2941 * @param[in] mask
2942 * RTE_FLOW_ITEM_TYPE_ETH entry mask.
2943 * @param[out] encap
2944 * Structure to fill the gathered MAC address data.
2945 */
2946 static void
2947 flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec,
2948 const struct rte_flow_item_eth *mask,
2949 struct flow_tcf_vxlan_encap *encap)
2950 {
2951 /* Item must be validated before. No redundant checks. */
2952 assert(spec);
2953 if (!mask || !memcmp(&mask->dst,
2954 &rte_flow_item_eth_mask.dst,
2955 sizeof(rte_flow_item_eth_mask.dst))) {
2956 		 * Ethernet addresses are not supported by
2957 		 * tc as tunnel_key parameters. The destination
2958 		 * address is needed to form the encap packet
2959 		 * header and is retrieved by the kernel from
2960 		 * implicit sources (ARP table, etc.);
2961 		 * address masks are not supported at all.
2962 * address masks are not supported at all.
2963 */
2964 encap->eth.dst = spec->dst;
2965 encap->mask |= FLOW_TCF_ENCAP_ETH_DST;
2966 }
2967 if (!mask || !memcmp(&mask->src,
2968 &rte_flow_item_eth_mask.src,
2969 sizeof(rte_flow_item_eth_mask.src))) {
2970 /*
2971 * Ethernet addresses are not supported by
2972 * tc as tunnel_key parameters. Source ethernet
2973 * address is ignored anyway.
2974 */
2975 encap->eth.src = spec->src;
2976 encap->mask |= FLOW_TCF_ENCAP_ETH_SRC;
2977 }
2978 }
2979
2980 /**
2981 * Helper function to process RTE_FLOW_ITEM_TYPE_IPV4 entry in configuration
2982 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV4 address fields
2983 * in the encapsulation parameters structure. The item must be prevalidated,
2984  * no validation checks are performed by this function.
2985 *
2986 * @param[in] spec
2987 * RTE_FLOW_ITEM_TYPE_IPV4 entry specification.
2988 * @param[in] mask
2989 * RTE_FLOW_ITEM_TYPE_IPV4 entry mask.
2990 * @param[out] encap
2991 * Structure to fill the gathered IPV4 address data.
2992 */
2993 static void
2994 flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec,
2995 const struct rte_flow_item_ipv4 *mask,
2996 struct flow_tcf_vxlan_encap *encap)
2997 {
2998 /* Item must be validated before. No redundant checks. */
2999 assert(spec);
3000 encap->ipv4.dst = spec->hdr.dst_addr;
3001 encap->ipv4.src = spec->hdr.src_addr;
3002 encap->mask |= FLOW_TCF_ENCAP_IPV4_SRC |
3003 FLOW_TCF_ENCAP_IPV4_DST;
3004 if (mask && mask->hdr.type_of_service) {
3005 encap->mask |= FLOW_TCF_ENCAP_IP_TOS;
3006 encap->ip_tos = spec->hdr.type_of_service;
3007 }
3008 if (mask && mask->hdr.time_to_live) {
3009 encap->mask |= FLOW_TCF_ENCAP_IP_TTL;
3010 encap->ip_ttl_hop = spec->hdr.time_to_live;
3011 }
3012 }
3013
3014 /**
3015 * Helper function to process RTE_FLOW_ITEM_TYPE_IPV6 entry in configuration
3016 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV6 address fields
3017 * in the encapsulation parameters structure. The item must be prevalidated,
3018  * no validation checks are performed by this function.
3019 *
3020 * @param[in] spec
3021 * RTE_FLOW_ITEM_TYPE_IPV6 entry specification.
3022 * @param[in] mask
3023 * RTE_FLOW_ITEM_TYPE_IPV6 entry mask.
3024 * @param[out] encap
3025 * Structure to fill the gathered IPV6 address data.
3026 */
3027 static void
3028 flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec,
3029 const struct rte_flow_item_ipv6 *mask,
3030 struct flow_tcf_vxlan_encap *encap)
3031 {
3032 /* Item must be validated before. No redundant checks. */
3033 assert(spec);
3034 memcpy(encap->ipv6.dst, spec->hdr.dst_addr, IPV6_ADDR_LEN);
3035 memcpy(encap->ipv6.src, spec->hdr.src_addr, IPV6_ADDR_LEN);
3036 encap->mask |= FLOW_TCF_ENCAP_IPV6_SRC |
3037 FLOW_TCF_ENCAP_IPV6_DST;
3038 if (mask) {
3039 if ((rte_be_to_cpu_32(mask->hdr.vtc_flow) >>
3040 IPV6_HDR_TC_SHIFT) & 0xff) {
3041 encap->mask |= FLOW_TCF_ENCAP_IP_TOS;
3042 encap->ip_tos = (rte_be_to_cpu_32
3043 (spec->hdr.vtc_flow) >>
3044 IPV6_HDR_TC_SHIFT) & 0xff;
3045 }
3046 if (mask->hdr.hop_limits) {
3047 encap->mask |= FLOW_TCF_ENCAP_IP_TTL;
3048 encap->ip_ttl_hop = spec->hdr.hop_limits;
3049 }
3050 }
3051 }
3052
3053 /**
3054 * Helper function to process RTE_FLOW_ITEM_TYPE_UDP entry in configuration
3055 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the UDP port fields
3056 * in the encapsulation parameters structure. The item must be prevalidated,
3057  * no validation checks are performed by this function.
3058 *
3059 * @param[in] spec
3060 * RTE_FLOW_ITEM_TYPE_UDP entry specification.
3061 * @param[in] mask
3062 * RTE_FLOW_ITEM_TYPE_UDP entry mask.
3063 * @param[out] encap
3064 * Structure to fill the gathered UDP port data.
3065 */
3066 static void
3067 flow_tcf_parse_vxlan_encap_udp(const struct rte_flow_item_udp *spec,
3068 const struct rte_flow_item_udp *mask,
3069 struct flow_tcf_vxlan_encap *encap)
3070 {
3071 assert(spec);
3072 encap->udp.dst = spec->hdr.dst_port;
3073 encap->mask |= FLOW_TCF_ENCAP_UDP_DST;
3074 if (!mask || mask->hdr.src_port != RTE_BE16(0x0000)) {
3075 encap->udp.src = spec->hdr.src_port;
3076 		encap->mask |= FLOW_TCF_ENCAP_UDP_SRC;
3077 }
3078 }
3079
3080 /**
3081 * Helper function to process RTE_FLOW_ITEM_TYPE_VXLAN entry in configuration
3082 * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the VNI fields
3083 * in the encapsulation parameters structure. The item must be prevalidated,
3084  * no validation checks are performed by this function.
3085 *
3086 * @param[in] spec
3087 * RTE_FLOW_ITEM_TYPE_VXLAN entry specification.
3088 * @param[out] encap
3089 * Structure to fill the gathered VNI address data.
3090 */
3091 static void
3092 flow_tcf_parse_vxlan_encap_vni(const struct rte_flow_item_vxlan *spec,
3093 struct flow_tcf_vxlan_encap *encap)
3094 {
3095 	/* Item must be validated before. No redundant checks. */
3096 assert(spec);
3097 memcpy(encap->vxlan.vni, spec->vni, sizeof(encap->vxlan.vni));
3098 encap->mask |= FLOW_TCF_ENCAP_VXLAN_VNI;
3099 }
3100
3101 /**
3102 * Populate consolidated encapsulation object from list of pattern items.
3103 *
3104  * Helper function to process the configuration of actions such as
3105  * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. The item list should be
3106  * validated beforehand, as there is no way to return a meaningful error.
3107 *
3108 * @param[in] action
3109 * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
3110 * List of pattern items to gather data from.
3111  * @param[out] encap
3112  *   Structure to fill with the gathered data.
3113 */
3114 static void
3115 flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action,
3116 struct flow_tcf_vxlan_encap *encap)
3117 {
3118 union {
3119 const struct rte_flow_item_eth *eth;
3120 const struct rte_flow_item_ipv4 *ipv4;
3121 const struct rte_flow_item_ipv6 *ipv6;
3122 const struct rte_flow_item_udp *udp;
3123 const struct rte_flow_item_vxlan *vxlan;
3124 } spec, mask;
3125 const struct rte_flow_item *items;
3126
3127 assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
3128 assert(action->conf);
3129
3130 items = ((const struct rte_flow_action_vxlan_encap *)
3131 action->conf)->definition;
3132 assert(items);
3133 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3134 switch (items->type) {
3135 case RTE_FLOW_ITEM_TYPE_VOID:
3136 break;
3137 case RTE_FLOW_ITEM_TYPE_ETH:
3138 mask.eth = items->mask;
3139 spec.eth = items->spec;
3140 flow_tcf_parse_vxlan_encap_eth(spec.eth, mask.eth,
3141 encap);
3142 break;
3143 case RTE_FLOW_ITEM_TYPE_IPV4:
3144 spec.ipv4 = items->spec;
3145 mask.ipv4 = items->mask;
3146 flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4, mask.ipv4,
3147 encap);
3148 break;
3149 case RTE_FLOW_ITEM_TYPE_IPV6:
3150 spec.ipv6 = items->spec;
3151 mask.ipv6 = items->mask;
3152 flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6, mask.ipv6,
3153 encap);
3154 break;
3155 case RTE_FLOW_ITEM_TYPE_UDP:
3156 mask.udp = items->mask;
3157 spec.udp = items->spec;
3158 flow_tcf_parse_vxlan_encap_udp(spec.udp, mask.udp,
3159 encap);
3160 break;
3161 case RTE_FLOW_ITEM_TYPE_VXLAN:
3162 spec.vxlan = items->spec;
3163 flow_tcf_parse_vxlan_encap_vni(spec.vxlan, encap);
3164 break;
3165 default:
3166 assert(false);
3167 DRV_LOG(WARNING,
3168 "unsupported item %p type %d,"
3169 " items must be validated"
3170 " before flow creation",
3171 (const void *)items, items->type);
3172 encap->mask = 0;
3173 return;
3174 }
3175 }
3176 }
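/*
 * For illustration (an assumption, not taken from the original sources):
 * a typical RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP definition handled by the
 * parser above is an item list of the form
 *   ETH / IPV4 (or IPV6) / UDP / VXLAN / END
 * where each item carries the header values to be written into the
 * encapsulation header of outgoing packets.
 */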
3177
3178 /**
3179 * Translate flow for Linux TC flower and construct Netlink message.
3180 *
3181  * @param[in] dev
3182  *   Pointer to the Ethernet device structure.
3183  * @param[in, out] dev_flow
3184  *   Pointer to the sub flow.
3185 * @param[in] attr
3186 * Pointer to the flow attributes.
3187 * @param[in] items
3188 * Pointer to the list of items.
3189 * @param[in] actions
3190 * Pointer to the list of actions.
3191 * @param[out] error
3192 * Pointer to the error structure.
3193 *
3194 * @return
3195 * 0 on success, a negative errno value otherwise and rte_errno is set.
3196 */
3197 static int
3198 flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow,
3199 const struct rte_flow_attr *attr,
3200 const struct rte_flow_item items[],
3201 const struct rte_flow_action actions[],
3202 struct rte_flow_error *error)
3203 {
3204 union {
3205 const struct rte_flow_item_port_id *port_id;
3206 const struct rte_flow_item_eth *eth;
3207 const struct rte_flow_item_vlan *vlan;
3208 const struct rte_flow_item_ipv4 *ipv4;
3209 const struct rte_flow_item_ipv6 *ipv6;
3210 const struct rte_flow_item_tcp *tcp;
3211 const struct rte_flow_item_udp *udp;
3212 const struct rte_flow_item_vxlan *vxlan;
3213 } spec, mask;
3214 union {
3215 const struct rte_flow_action_port_id *port_id;
3216 const struct rte_flow_action_jump *jump;
3217 const struct rte_flow_action_of_push_vlan *of_push_vlan;
3218 const struct rte_flow_action_of_set_vlan_vid *
3219 of_set_vlan_vid;
3220 const struct rte_flow_action_of_set_vlan_pcp *
3221 of_set_vlan_pcp;
3222 } conf;
3223 union {
3224 struct flow_tcf_tunnel_hdr *hdr;
3225 struct flow_tcf_vxlan_decap *vxlan;
3226 } decap = {
3227 .hdr = NULL,
3228 };
3229 union {
3230 struct flow_tcf_tunnel_hdr *hdr;
3231 struct flow_tcf_vxlan_encap *vxlan;
3232 } encap = {
3233 .hdr = NULL,
3234 };
3235 struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
3236 struct nlmsghdr *nlh = dev_flow->tcf.nlh;
3237 struct tcmsg *tcm = dev_flow->tcf.tcm;
3238 uint32_t na_act_index_cur;
3239 rte_be16_t inner_etype = RTE_BE16(ETH_P_ALL);
3240 rte_be16_t outer_etype = RTE_BE16(ETH_P_ALL);
3241 rte_be16_t vlan_etype = RTE_BE16(ETH_P_ALL);
3242 bool ip_proto_set = 0;
3243 bool tunnel_outer = 0;
3244 struct nlattr *na_flower;
3245 struct nlattr *na_flower_act;
3246 struct nlattr *na_vlan_id = NULL;
3247 struct nlattr *na_vlan_priority = NULL;
3248 uint64_t item_flags = 0;
3249 int ret;
3250
3251 claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
3252 PTOI_TABLE_SZ_MAX(dev)));
3253 if (dev_flow->tcf.tunnel) {
3254 switch (dev_flow->tcf.tunnel->type) {
3255 case FLOW_TCF_TUNACT_VXLAN_DECAP:
3256 decap.vxlan = dev_flow->tcf.vxlan_decap;
3257 tunnel_outer = 1;
3258 break;
3259 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
3260 encap.vxlan = dev_flow->tcf.vxlan_encap;
3261 break;
3262 /* New tunnel actions can be added here. */
3263 default:
3264 assert(false);
3265 break;
3266 }
3267 }
3268 nlh = dev_flow->tcf.nlh;
3269 tcm = dev_flow->tcf.tcm;
3270 /* Prepare API must have been called beforehand. */
3271 assert(nlh != NULL && tcm != NULL);
3272 tcm->tcm_family = AF_UNSPEC;
3273 tcm->tcm_ifindex = ptoi[0].ifindex;
3274 tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
3275 /*
3276 * Priority cannot be zero to prevent the kernel from picking one
3277 * automatically.
3278 */
3279 tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16, outer_etype);
3280 if (attr->group > 0)
3281 mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group);
3282 mnl_attr_put_strz(nlh, TCA_KIND, "flower");
3283 na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
3284 for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
3285 unsigned int i;
3286
3287 switch (items->type) {
3288 case RTE_FLOW_ITEM_TYPE_VOID:
3289 break;
3290 case RTE_FLOW_ITEM_TYPE_PORT_ID:
3291 mask.port_id = flow_tcf_item_mask
3292 (items, &rte_flow_item_port_id_mask,
3293 &flow_tcf_mask_supported.port_id,
3294 &flow_tcf_mask_empty.port_id,
3295 sizeof(flow_tcf_mask_supported.port_id),
3296 error);
3297 assert(mask.port_id);
3298 if (mask.port_id == &flow_tcf_mask_empty.port_id)
3299 break;
3300 spec.port_id = items->spec;
3301 if (!mask.port_id->id)
3302 i = 0;
3303 else
3304 for (i = 0; ptoi[i].ifindex; ++i)
3305 if (ptoi[i].port_id == spec.port_id->id)
3306 break;
3307 assert(ptoi[i].ifindex);
3308 tcm->tcm_ifindex = ptoi[i].ifindex;
3309 break;
3310 case RTE_FLOW_ITEM_TYPE_ETH:
3311 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3312 MLX5_FLOW_LAYER_INNER_L2 :
3313 MLX5_FLOW_LAYER_OUTER_L2;
3314 mask.eth = flow_tcf_item_mask
3315 (items, &rte_flow_item_eth_mask,
3316 &flow_tcf_mask_supported.eth,
3317 &flow_tcf_mask_empty.eth,
3318 sizeof(flow_tcf_mask_supported.eth),
3319 error);
3320 assert(mask.eth);
3321 if (mask.eth == &flow_tcf_mask_empty.eth)
3322 break;
3323 spec.eth = items->spec;
3324 if (mask.eth->type) {
3325 if (item_flags & MLX5_FLOW_LAYER_TUNNEL)
3326 inner_etype = spec.eth->type;
3327 else
3328 outer_etype = spec.eth->type;
3329 }
3330 if (tunnel_outer) {
3331 DRV_LOG(WARNING,
3332 					"outer L2 addresses cannot be"
3333 					" forced for tunnel outer headers,"
3334 					" parameter is ignored");
3335 break;
3336 }
3337 if (!is_zero_ether_addr(&mask.eth->dst)) {
3338 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
3339 ETHER_ADDR_LEN,
3340 spec.eth->dst.addr_bytes);
3341 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK,
3342 ETHER_ADDR_LEN,
3343 mask.eth->dst.addr_bytes);
3344 }
3345 if (!is_zero_ether_addr(&mask.eth->src)) {
3346 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC,
3347 ETHER_ADDR_LEN,
3348 spec.eth->src.addr_bytes);
3349 mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK,
3350 ETHER_ADDR_LEN,
3351 mask.eth->src.addr_bytes);
3352 }
3353 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3354 break;
3355 case RTE_FLOW_ITEM_TYPE_VLAN:
3356 assert(!encap.hdr);
3357 assert(!decap.hdr);
3358 assert(!tunnel_outer);
3359 item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
3360 mask.vlan = flow_tcf_item_mask
3361 (items, &rte_flow_item_vlan_mask,
3362 &flow_tcf_mask_supported.vlan,
3363 &flow_tcf_mask_empty.vlan,
3364 sizeof(flow_tcf_mask_supported.vlan),
3365 error);
3366 assert(mask.vlan);
3367 if (mask.vlan == &flow_tcf_mask_empty.vlan)
3368 break;
3369 spec.vlan = items->spec;
3370 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3371 outer_etype == RTE_BE16(ETH_P_8021Q));
3372 outer_etype = RTE_BE16(ETH_P_8021Q);
3373 if (mask.vlan->inner_type)
3374 vlan_etype = spec.vlan->inner_type;
3375 if (mask.vlan->tci & RTE_BE16(0xe000))
3376 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO,
3377 (rte_be_to_cpu_16
3378 (spec.vlan->tci) >> 13) & 0x7);
3379 if (mask.vlan->tci & RTE_BE16(0x0fff))
3380 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ID,
3381 rte_be_to_cpu_16
3382 (spec.vlan->tci &
3383 RTE_BE16(0x0fff)));
3384 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3385 break;
3386 case RTE_FLOW_ITEM_TYPE_IPV4:
3387 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3388 MLX5_FLOW_LAYER_INNER_L3_IPV4 :
3389 MLX5_FLOW_LAYER_OUTER_L3_IPV4;
3390 mask.ipv4 = flow_tcf_item_mask
3391 (items, &rte_flow_item_ipv4_mask,
3392 &flow_tcf_mask_supported.ipv4,
3393 &flow_tcf_mask_empty.ipv4,
3394 sizeof(flow_tcf_mask_supported.ipv4),
3395 error);
3396 assert(mask.ipv4);
3397 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3398 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3399 inner_etype == RTE_BE16(ETH_P_IP));
3400 inner_etype = RTE_BE16(ETH_P_IP);
3401 } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3402 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3403 vlan_etype == RTE_BE16(ETH_P_IP));
3404 vlan_etype = RTE_BE16(ETH_P_IP);
3405 } else {
3406 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3407 outer_etype == RTE_BE16(ETH_P_IP));
3408 outer_etype = RTE_BE16(ETH_P_IP);
3409 }
3410 spec.ipv4 = items->spec;
3411 if (!tunnel_outer && mask.ipv4->hdr.next_proto_id) {
3412 /*
3413 * No way to set IP protocol for outer tunnel
3414 * layers. Usually it is fixed, for example,
3415 * to UDP for VXLAN/GPE.
3416 */
3417 assert(spec.ipv4); /* Mask is not empty. */
3418 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3419 spec.ipv4->hdr.next_proto_id);
3420 ip_proto_set = 1;
3421 }
3422 if (mask.ipv4 == &flow_tcf_mask_empty.ipv4 ||
3423 (!mask.ipv4->hdr.src_addr &&
3424 !mask.ipv4->hdr.dst_addr)) {
3425 if (!tunnel_outer)
3426 break;
3427 /*
3428 				 * For the tunnel outer layer we must set the
3429 				 * outer IP key anyway, even if the specification
3430 				 * or mask is empty. There is no other way to tell
3431 				 * the kernel about the outer layer protocol.
3432 */
3433 mnl_attr_put_u32
3434 (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC,
3435 mask.ipv4->hdr.src_addr);
3436 mnl_attr_put_u32
3437 (nlh, TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
3438 mask.ipv4->hdr.src_addr);
3439 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3440 break;
3441 }
3442 if (mask.ipv4->hdr.src_addr) {
3443 mnl_attr_put_u32
3444 (nlh, tunnel_outer ?
3445 TCA_FLOWER_KEY_ENC_IPV4_SRC :
3446 TCA_FLOWER_KEY_IPV4_SRC,
3447 spec.ipv4->hdr.src_addr);
3448 mnl_attr_put_u32
3449 (nlh, tunnel_outer ?
3450 TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK :
3451 TCA_FLOWER_KEY_IPV4_SRC_MASK,
3452 mask.ipv4->hdr.src_addr);
3453 }
3454 if (mask.ipv4->hdr.dst_addr) {
3455 mnl_attr_put_u32
3456 (nlh, tunnel_outer ?
3457 TCA_FLOWER_KEY_ENC_IPV4_DST :
3458 TCA_FLOWER_KEY_IPV4_DST,
3459 spec.ipv4->hdr.dst_addr);
3460 mnl_attr_put_u32
3461 (nlh, tunnel_outer ?
3462 TCA_FLOWER_KEY_ENC_IPV4_DST_MASK :
3463 TCA_FLOWER_KEY_IPV4_DST_MASK,
3464 mask.ipv4->hdr.dst_addr);
3465 }
3466 if (mask.ipv4->hdr.time_to_live) {
3467 mnl_attr_put_u8
3468 (nlh, tunnel_outer ?
3469 TCA_FLOWER_KEY_ENC_IP_TTL :
3470 TCA_FLOWER_KEY_IP_TTL,
3471 spec.ipv4->hdr.time_to_live);
3472 mnl_attr_put_u8
3473 (nlh, tunnel_outer ?
3474 TCA_FLOWER_KEY_ENC_IP_TTL_MASK :
3475 TCA_FLOWER_KEY_IP_TTL_MASK,
3476 mask.ipv4->hdr.time_to_live);
3477 }
3478 if (mask.ipv4->hdr.type_of_service) {
3479 mnl_attr_put_u8
3480 (nlh, tunnel_outer ?
3481 TCA_FLOWER_KEY_ENC_IP_TOS :
3482 TCA_FLOWER_KEY_IP_TOS,
3483 spec.ipv4->hdr.type_of_service);
3484 mnl_attr_put_u8
3485 (nlh, tunnel_outer ?
3486 TCA_FLOWER_KEY_ENC_IP_TOS_MASK :
3487 TCA_FLOWER_KEY_IP_TOS_MASK,
3488 mask.ipv4->hdr.type_of_service);
3489 }
3490 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3491 break;
3492 case RTE_FLOW_ITEM_TYPE_IPV6: {
3493 bool ipv6_src, ipv6_dst;
3494 uint8_t msk6, tos6;
3495
3496 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3497 MLX5_FLOW_LAYER_INNER_L3_IPV6 :
3498 MLX5_FLOW_LAYER_OUTER_L3_IPV6;
3499 mask.ipv6 = flow_tcf_item_mask
3500 (items, &rte_flow_item_ipv6_mask,
3501 &flow_tcf_mask_supported.ipv6,
3502 &flow_tcf_mask_empty.ipv6,
3503 sizeof(flow_tcf_mask_supported.ipv6),
3504 error);
3505 assert(mask.ipv6);
3506 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3507 assert(inner_etype == RTE_BE16(ETH_P_ALL) ||
3508 inner_etype == RTE_BE16(ETH_P_IPV6));
3509 inner_etype = RTE_BE16(ETH_P_IPV6);
3510 } else if (outer_etype == RTE_BE16(ETH_P_8021Q)) {
3511 assert(vlan_etype == RTE_BE16(ETH_P_ALL) ||
3512 vlan_etype == RTE_BE16(ETH_P_IPV6));
3513 vlan_etype = RTE_BE16(ETH_P_IPV6);
3514 } else {
3515 assert(outer_etype == RTE_BE16(ETH_P_ALL) ||
3516 outer_etype == RTE_BE16(ETH_P_IPV6));
3517 outer_etype = RTE_BE16(ETH_P_IPV6);
3518 }
3519 spec.ipv6 = items->spec;
3520 if (!tunnel_outer && mask.ipv6->hdr.proto) {
3521 /*
3522 * No way to set IP protocol for outer tunnel
3523 * layers. Usually it is fixed, for example,
3524 * to UDP for VXLAN/GPE.
3525 */
3526 assert(spec.ipv6); /* Mask is not empty. */
3527 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3528 spec.ipv6->hdr.proto);
3529 ip_proto_set = 1;
3530 }
3531 ipv6_dst = !IN6_IS_ADDR_UNSPECIFIED
3532 (mask.ipv6->hdr.dst_addr);
3533 ipv6_src = !IN6_IS_ADDR_UNSPECIFIED
3534 (mask.ipv6->hdr.src_addr);
3535 if (mask.ipv6 == &flow_tcf_mask_empty.ipv6 ||
3536 (!ipv6_dst && !ipv6_src)) {
3537 if (!tunnel_outer)
3538 break;
3539 /*
3540 				 * For the tunnel outer layer we must set the
3541 				 * outer IP key anyway, even if the specification
3542 				 * or mask is empty. There is no other way to tell
3543 				 * the kernel about the outer layer protocol.
3544 */
3545 mnl_attr_put(nlh,
3546 TCA_FLOWER_KEY_ENC_IPV6_SRC,
3547 IPV6_ADDR_LEN,
3548 mask.ipv6->hdr.src_addr);
3549 mnl_attr_put(nlh,
3550 TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
3551 IPV6_ADDR_LEN,
3552 mask.ipv6->hdr.src_addr);
3553 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3554 break;
3555 }
3556 if (ipv6_src) {
3557 mnl_attr_put(nlh, tunnel_outer ?
3558 TCA_FLOWER_KEY_ENC_IPV6_SRC :
3559 TCA_FLOWER_KEY_IPV6_SRC,
3560 IPV6_ADDR_LEN,
3561 spec.ipv6->hdr.src_addr);
3562 mnl_attr_put(nlh, tunnel_outer ?
3563 TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK :
3564 TCA_FLOWER_KEY_IPV6_SRC_MASK,
3565 IPV6_ADDR_LEN,
3566 mask.ipv6->hdr.src_addr);
3567 }
3568 if (ipv6_dst) {
3569 mnl_attr_put(nlh, tunnel_outer ?
3570 TCA_FLOWER_KEY_ENC_IPV6_DST :
3571 TCA_FLOWER_KEY_IPV6_DST,
3572 IPV6_ADDR_LEN,
3573 spec.ipv6->hdr.dst_addr);
3574 mnl_attr_put(nlh, tunnel_outer ?
3575 TCA_FLOWER_KEY_ENC_IPV6_DST_MASK :
3576 TCA_FLOWER_KEY_IPV6_DST_MASK,
3577 IPV6_ADDR_LEN,
3578 mask.ipv6->hdr.dst_addr);
3579 }
3580 if (mask.ipv6->hdr.hop_limits) {
3581 mnl_attr_put_u8
3582 (nlh, tunnel_outer ?
3583 TCA_FLOWER_KEY_ENC_IP_TTL :
3584 TCA_FLOWER_KEY_IP_TTL,
3585 spec.ipv6->hdr.hop_limits);
3586 mnl_attr_put_u8
3587 (nlh, tunnel_outer ?
3588 TCA_FLOWER_KEY_ENC_IP_TTL_MASK :
3589 TCA_FLOWER_KEY_IP_TTL_MASK,
3590 mask.ipv6->hdr.hop_limits);
3591 }
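/*
 * Worked example (editor's note, assuming the usual IPV6_HDR_TC_SHIFT
 * of 20 bits): a vtc_flow mask of 0x0ff00000 yields msk6 == 0xff below,
 * i.e. the full Traffic Class byte is matched and both the TOS key and
 * its mask are emitted.
 */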
3592 msk6 = (rte_be_to_cpu_32(mask.ipv6->hdr.vtc_flow) >>
3593 IPV6_HDR_TC_SHIFT) & 0xff;
3594 if (msk6) {
3595 tos6 = (rte_be_to_cpu_32
3596 (spec.ipv6->hdr.vtc_flow) >>
3597 IPV6_HDR_TC_SHIFT) & 0xff;
3598 mnl_attr_put_u8
3599 (nlh, tunnel_outer ?
3600 TCA_FLOWER_KEY_ENC_IP_TOS :
3601 TCA_FLOWER_KEY_IP_TOS, tos6);
3602 mnl_attr_put_u8
3603 (nlh, tunnel_outer ?
3604 TCA_FLOWER_KEY_ENC_IP_TOS_MASK :
3605 TCA_FLOWER_KEY_IP_TOS_MASK, msk6);
3606 }
3607 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3608 break;
3609 }
3610 case RTE_FLOW_ITEM_TYPE_UDP:
3611 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3612 MLX5_FLOW_LAYER_INNER_L4_UDP :
3613 MLX5_FLOW_LAYER_OUTER_L4_UDP;
3614 mask.udp = flow_tcf_item_mask
3615 (items, &rte_flow_item_udp_mask,
3616 &flow_tcf_mask_supported.udp,
3617 &flow_tcf_mask_empty.udp,
3618 sizeof(flow_tcf_mask_supported.udp),
3619 error);
3620 assert(mask.udp);
3621 spec.udp = items->spec;
3622 if (!tunnel_outer) {
3623 if (!ip_proto_set)
3624 mnl_attr_put_u8
3625 (nlh, TCA_FLOWER_KEY_IP_PROTO,
3626 IPPROTO_UDP);
3627 if (mask.udp == &flow_tcf_mask_empty.udp)
3628 break;
3629 } else {
3630 assert(mask.udp != &flow_tcf_mask_empty.udp);
3631 decap.vxlan->udp_port =
3632 rte_be_to_cpu_16
3633 (spec.udp->hdr.dst_port);
3634 }
3635 if (mask.udp->hdr.src_port) {
3636 mnl_attr_put_u16
3637 (nlh, tunnel_outer ?
3638 TCA_FLOWER_KEY_ENC_UDP_SRC_PORT :
3639 TCA_FLOWER_KEY_UDP_SRC,
3640 spec.udp->hdr.src_port);
3641 mnl_attr_put_u16
3642 (nlh, tunnel_outer ?
3643 TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK :
3644 TCA_FLOWER_KEY_UDP_SRC_MASK,
3645 mask.udp->hdr.src_port);
3646 }
3647 if (mask.udp->hdr.dst_port) {
3648 mnl_attr_put_u16
3649 (nlh, tunnel_outer ?
3650 TCA_FLOWER_KEY_ENC_UDP_DST_PORT :
3651 TCA_FLOWER_KEY_UDP_DST,
3652 spec.udp->hdr.dst_port);
3653 mnl_attr_put_u16
3654 (nlh, tunnel_outer ?
3655 TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK :
3656 TCA_FLOWER_KEY_UDP_DST_MASK,
3657 mask.udp->hdr.dst_port);
3658 }
3659 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3660 break;
3661 case RTE_FLOW_ITEM_TYPE_TCP:
3662 item_flags |= (item_flags & MLX5_FLOW_LAYER_TUNNEL) ?
3663 MLX5_FLOW_LAYER_INNER_L4_TCP :
3664 MLX5_FLOW_LAYER_OUTER_L4_TCP;
3665 mask.tcp = flow_tcf_item_mask
3666 (items, &rte_flow_item_tcp_mask,
3667 &flow_tcf_mask_supported.tcp,
3668 &flow_tcf_mask_empty.tcp,
3669 sizeof(flow_tcf_mask_supported.tcp),
3670 error);
3671 assert(mask.tcp);
3672 if (!ip_proto_set)
3673 mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
3674 IPPROTO_TCP);
3675 if (mask.tcp == &flow_tcf_mask_empty.tcp)
3676 break;
3677 spec.tcp = items->spec;
3678 if (mask.tcp->hdr.src_port) {
3679 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_SRC,
3680 spec.tcp->hdr.src_port);
3681 mnl_attr_put_u16(nlh,
3682 TCA_FLOWER_KEY_TCP_SRC_MASK,
3683 mask.tcp->hdr.src_port);
3684 }
3685 if (mask.tcp->hdr.dst_port) {
3686 mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_DST,
3687 spec.tcp->hdr.dst_port);
3688 mnl_attr_put_u16(nlh,
3689 TCA_FLOWER_KEY_TCP_DST_MASK,
3690 mask.tcp->hdr.dst_port);
3691 }
3692 if (mask.tcp->hdr.tcp_flags) {
3693 mnl_attr_put_u16
3694 (nlh,
3695 TCA_FLOWER_KEY_TCP_FLAGS,
3696 rte_cpu_to_be_16
3697 (spec.tcp->hdr.tcp_flags));
3698 mnl_attr_put_u16
3699 (nlh,
3700 TCA_FLOWER_KEY_TCP_FLAGS_MASK,
3701 rte_cpu_to_be_16
3702 (mask.tcp->hdr.tcp_flags));
3703 }
3704 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3705 break;
3706 case RTE_FLOW_ITEM_TYPE_VXLAN:
3707 assert(decap.vxlan);
3708 tunnel_outer = 0;
3709 item_flags |= MLX5_FLOW_LAYER_VXLAN;
3710 spec.vxlan = items->spec;
3711 mnl_attr_put_u32(nlh,
3712 TCA_FLOWER_KEY_ENC_KEY_ID,
3713 vxlan_vni_as_be32(spec.vxlan->vni));
3714 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3715 break;
3716 default:
3717 return rte_flow_error_set(error, ENOTSUP,
3718 RTE_FLOW_ERROR_TYPE_ITEM,
3719 NULL, "item not supported");
3720 }
3721 }
3722 /*
3723 * Set the ether_type flower key and tc rule protocol:
3724 * - if there is neither a VLAN nor a VXLAN item, the key is taken
3725 * from the eth item directly or deduced from the L3 items.
3726 * - if there is a VLAN item, the key is fixed to 802.1Q.
3727 * - if there is a VXLAN item, the key is set to the inner tunnel type.
3728 * - simultaneous VLAN and VXLAN items are prohibited.
3729 */
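/*
 * Worked example (editor's illustration, not part of the original code):
 * for an "eth / vlan / ipv4" pattern the item loop above leaves
 * outer_etype == RTE_BE16(ETH_P_8021Q) and vlan_etype == RTE_BE16(ETH_P_IP),
 * assuming the VLAN item handler fixes the outer type to 802.1Q. The block
 * below then emits roughly:
 *
 *	mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_ETH_TYPE, outer_etype);
 *	mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ETH_TYPE, vlan_etype);
 *
 * and sets the tc rule protocol in tcm_info to 802.1Q.
 */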
3730 if (outer_etype != RTE_BE16(ETH_P_ALL)) {
3731 tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
3732 outer_etype);
3733 if (item_flags & MLX5_FLOW_LAYER_TUNNEL) {
3734 if (inner_etype != RTE_BE16(ETH_P_ALL))
3735 mnl_attr_put_u16(nlh,
3736 TCA_FLOWER_KEY_ETH_TYPE,
3737 inner_etype);
3738 } else {
3739 mnl_attr_put_u16(nlh,
3740 TCA_FLOWER_KEY_ETH_TYPE,
3741 outer_etype);
3742 if (outer_etype == RTE_BE16(ETH_P_8021Q) &&
3743 vlan_etype != RTE_BE16(ETH_P_ALL))
3744 mnl_attr_put_u16(nlh,
3745 TCA_FLOWER_KEY_VLAN_ETH_TYPE,
3746 vlan_etype);
3747 }
3748 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3749 }
3750 na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT);
3751 na_act_index_cur = 1;
3752 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
3753 struct nlattr *na_act_index;
3754 struct nlattr *na_act;
3755 unsigned int vlan_act;
3756 unsigned int i;
3757
3758 switch (actions->type) {
3759 case RTE_FLOW_ACTION_TYPE_VOID:
3760 break;
3761 case RTE_FLOW_ACTION_TYPE_PORT_ID:
3762 conf.port_id = actions->conf;
3763 if (conf.port_id->original)
3764 i = 0;
3765 else
3766 for (i = 0; ptoi[i].ifindex; ++i)
3767 if (ptoi[i].port_id == conf.port_id->id)
3768 break;
3769 assert(ptoi[i].ifindex);
3770 na_act_index =
3771 mnl_attr_nest_start(nlh, na_act_index_cur++);
3772 assert(na_act_index);
3773 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
3774 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3775 assert(na_act);
3776 if (encap.hdr) {
3777 assert(dev_flow->tcf.tunnel);
3778 dev_flow->tcf.tunnel->ifindex_ptr =
3779 &((struct tc_mirred *)
3780 mnl_attr_get_payload
3781 (mnl_nlmsg_get_payload_tail
3782 (nlh)))->ifindex;
3783 } else if (decap.hdr) {
3784 assert(dev_flow->tcf.tunnel);
3785 dev_flow->tcf.tunnel->ifindex_ptr =
3786 (unsigned int *)&tcm->tcm_ifindex;
3787 }
3788 mnl_attr_put(nlh, TCA_MIRRED_PARMS,
3789 sizeof(struct tc_mirred),
3790 &(struct tc_mirred){
3791 .action = TC_ACT_STOLEN,
3792 .eaction = TCA_EGRESS_REDIR,
3793 .ifindex = ptoi[i].ifindex,
3794 });
3795 mnl_attr_nest_end(nlh, na_act);
3796 mnl_attr_nest_end(nlh, na_act_index);
3797 break;
3798 case RTE_FLOW_ACTION_TYPE_JUMP:
3799 conf.jump = actions->conf;
3800 na_act_index =
3801 mnl_attr_nest_start(nlh, na_act_index_cur++);
3802 assert(na_act_index);
3803 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3804 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3805 assert(na_act);
3806 mnl_attr_put(nlh, TCA_GACT_PARMS,
3807 sizeof(struct tc_gact),
3808 &(struct tc_gact){
3809 .action = TC_ACT_GOTO_CHAIN |
3810 conf.jump->group,
3811 });
3812 mnl_attr_nest_end(nlh, na_act);
3813 mnl_attr_nest_end(nlh, na_act_index);
3814 break;
3815 case RTE_FLOW_ACTION_TYPE_DROP:
3816 na_act_index =
3817 mnl_attr_nest_start(nlh, na_act_index_cur++);
3818 assert(na_act_index);
3819 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
3820 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3821 assert(na_act);
3822 mnl_attr_put(nlh, TCA_GACT_PARMS,
3823 sizeof(struct tc_gact),
3824 &(struct tc_gact){
3825 .action = TC_ACT_SHOT,
3826 });
3827 mnl_attr_nest_end(nlh, na_act);
3828 mnl_attr_nest_end(nlh, na_act_index);
3829 break;
3830 case RTE_FLOW_ACTION_TYPE_COUNT:
3831 /*
3832 * Driver adds the count action implicitly for
3833 * each rule it creates.
3834 */
3835 ret = flow_tcf_translate_action_count(dev,
3836 dev_flow, error);
3837 if (ret < 0)
3838 return ret;
3839 break;
3840 case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
3841 conf.of_push_vlan = NULL;
3842 vlan_act = TCA_VLAN_ACT_POP;
3843 goto action_of_vlan;
3844 case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
3845 conf.of_push_vlan = actions->conf;
3846 vlan_act = TCA_VLAN_ACT_PUSH;
3847 goto action_of_vlan;
3848 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
3849 conf.of_set_vlan_vid = actions->conf;
3850 if (na_vlan_id)
3851 goto override_na_vlan_id;
3852 vlan_act = TCA_VLAN_ACT_MODIFY;
3853 goto action_of_vlan;
3854 case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
3855 conf.of_set_vlan_pcp = actions->conf;
3856 if (na_vlan_priority)
3857 goto override_na_vlan_priority;
3858 vlan_act = TCA_VLAN_ACT_MODIFY;
3859 goto action_of_vlan;
3860 action_of_vlan:
3861 na_act_index =
3862 mnl_attr_nest_start(nlh, na_act_index_cur++);
3863 assert(na_act_index);
3864 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "vlan");
3865 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3866 assert(na_act);
3867 mnl_attr_put(nlh, TCA_VLAN_PARMS,
3868 sizeof(struct tc_vlan),
3869 &(struct tc_vlan){
3870 .action = TC_ACT_PIPE,
3871 .v_action = vlan_act,
3872 });
3873 if (vlan_act == TCA_VLAN_ACT_POP) {
3874 mnl_attr_nest_end(nlh, na_act);
3875 mnl_attr_nest_end(nlh, na_act_index);
3876 break;
3877 }
3878 if (vlan_act == TCA_VLAN_ACT_PUSH)
3879 mnl_attr_put_u16(nlh,
3880 TCA_VLAN_PUSH_VLAN_PROTOCOL,
3881 conf.of_push_vlan->ethertype);
3882 na_vlan_id = mnl_nlmsg_get_payload_tail(nlh);
3883 mnl_attr_put_u16(nlh, TCA_VLAN_PAD, 0);
3884 na_vlan_priority = mnl_nlmsg_get_payload_tail(nlh);
3885 mnl_attr_put_u8(nlh, TCA_VLAN_PAD, 0);
3886 mnl_attr_nest_end(nlh, na_act);
3887 mnl_attr_nest_end(nlh, na_act_index);
3888 if (actions->type ==
3889 RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
3890 override_na_vlan_id:
3891 na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID;
3892 *(uint16_t *)mnl_attr_get_payload(na_vlan_id) =
3893 rte_be_to_cpu_16
3894 (conf.of_set_vlan_vid->vlan_vid);
3895 } else if (actions->type ==
3896 RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
3897 override_na_vlan_priority:
3898 na_vlan_priority->nla_type =
3899 TCA_VLAN_PUSH_VLAN_PRIORITY;
3900 *(uint8_t *)mnl_attr_get_payload
3901 (na_vlan_priority) =
3902 conf.of_set_vlan_pcp->vlan_pcp;
3903 }
3904 break;
3905 case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
3906 assert(decap.vxlan);
3907 assert(dev_flow->tcf.tunnel);
3908 dev_flow->tcf.tunnel->ifindex_ptr =
3909 (unsigned int *)&tcm->tcm_ifindex;
3910 na_act_index =
3911 mnl_attr_nest_start(nlh, na_act_index_cur++);
3912 assert(na_act_index);
3913 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3914 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3915 assert(na_act);
3916 mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3917 sizeof(struct tc_tunnel_key),
3918 &(struct tc_tunnel_key){
3919 .action = TC_ACT_PIPE,
3920 .t_action = TCA_TUNNEL_KEY_ACT_RELEASE,
3921 });
3922 mnl_attr_nest_end(nlh, na_act);
3923 mnl_attr_nest_end(nlh, na_act_index);
3924 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3925 break;
3926 case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
3927 assert(encap.vxlan);
3928 flow_tcf_vxlan_encap_parse(actions, encap.vxlan);
3929 na_act_index =
3930 mnl_attr_nest_start(nlh, na_act_index_cur++);
3931 assert(na_act_index);
3932 mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
3933 na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
3934 assert(na_act);
3935 mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
3936 sizeof(struct tc_tunnel_key),
3937 &(struct tc_tunnel_key){
3938 .action = TC_ACT_PIPE,
3939 .t_action = TCA_TUNNEL_KEY_ACT_SET,
3940 });
3941 if (encap.vxlan->mask & FLOW_TCF_ENCAP_UDP_DST)
3942 mnl_attr_put_u16(nlh,
3943 TCA_TUNNEL_KEY_ENC_DST_PORT,
3944 encap.vxlan->udp.dst);
3945 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_SRC)
3946 mnl_attr_put_u32(nlh,
3947 TCA_TUNNEL_KEY_ENC_IPV4_SRC,
3948 encap.vxlan->ipv4.src);
3949 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV4_DST)
3950 mnl_attr_put_u32(nlh,
3951 TCA_TUNNEL_KEY_ENC_IPV4_DST,
3952 encap.vxlan->ipv4.dst);
3953 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_SRC)
3954 mnl_attr_put(nlh,
3955 TCA_TUNNEL_KEY_ENC_IPV6_SRC,
3956 sizeof(encap.vxlan->ipv6.src),
3957 &encap.vxlan->ipv6.src);
3958 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IPV6_DST)
3959 mnl_attr_put(nlh,
3960 TCA_TUNNEL_KEY_ENC_IPV6_DST,
3961 sizeof(encap.vxlan->ipv6.dst),
3962 &encap.vxlan->ipv6.dst);
3963 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IP_TTL)
3964 mnl_attr_put_u8(nlh,
3965 TCA_TUNNEL_KEY_ENC_TTL,
3966 encap.vxlan->ip_ttl_hop);
3967 if (encap.vxlan->mask & FLOW_TCF_ENCAP_IP_TOS)
3968 mnl_attr_put_u8(nlh,
3969 TCA_TUNNEL_KEY_ENC_TOS,
3970 encap.vxlan->ip_tos);
3971 if (encap.vxlan->mask & FLOW_TCF_ENCAP_VXLAN_VNI)
3972 mnl_attr_put_u32(nlh,
3973 TCA_TUNNEL_KEY_ENC_KEY_ID,
3974 vxlan_vni_as_be32
3975 (encap.vxlan->vxlan.vni));
3976 mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM, 0);
3977 mnl_attr_nest_end(nlh, na_act);
3978 mnl_attr_nest_end(nlh, na_act_index);
3979 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
3980 break;
3981 case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
3982 case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
3983 case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
3984 case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
3985 case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
3986 case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
3987 case RTE_FLOW_ACTION_TYPE_SET_TTL:
3988 case RTE_FLOW_ACTION_TYPE_DEC_TTL:
3989 case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
3990 case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
3991 na_act_index =
3992 mnl_attr_nest_start(nlh, na_act_index_cur++);
3993 flow_tcf_create_pedit_mnl_msg(nlh,
3994 &actions, item_flags);
3995 mnl_attr_nest_end(nlh, na_act_index);
3996 break;
3997 default:
3998 return rte_flow_error_set(error, ENOTSUP,
3999 RTE_FLOW_ERROR_TYPE_ACTION,
4000 actions,
4001 "action not supported");
4002 }
4003 }
4004 assert(na_flower);
4005 assert(na_flower_act);
4006 mnl_attr_nest_end(nlh, na_flower_act);
4007 dev_flow->tcf.ptc_flags = mnl_attr_get_payload
4008 (mnl_nlmsg_get_payload_tail(nlh));
4009 mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, decap.vxlan ?
4010 0 : TCA_CLS_FLAGS_SKIP_SW);
4011 mnl_attr_nest_end(nlh, na_flower);
4012 if (dev_flow->tcf.tunnel && dev_flow->tcf.tunnel->ifindex_ptr)
4013 dev_flow->tcf.tunnel->ifindex_org =
4014 *dev_flow->tcf.tunnel->ifindex_ptr;
4015 assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
4016 return 0;
4017 }
4018
4019 /**
4020 * Send Netlink message with acknowledgment.
4021 *
4022 * @param tcf
4023 * Flow context to use.
4024 * @param nlh
4025 * Message to send. This function always raises the NLM_F_ACK flag before
4026 * sending.
4027 * @param[in] cb
4028 * Callback handler for received message.
4029 * @param[in] arg
4030 * Context pointer for callback handler.
4031 *
4032 * @return
4033 * 0 on success, a negative errno value otherwise and rte_errno is set.
4034 */
4035 static int
4036 flow_tcf_nl_ack(struct mlx5_flow_tcf_context *tcf,
4037 struct nlmsghdr *nlh,
4038 mnl_cb_t cb, void *arg)
4039 {
4040 unsigned int portid = mnl_socket_get_portid(tcf->nl);
4041 uint32_t seq = tcf->seq++;
4042 int ret, err = 0;
4043
4044 assert(tcf->nl);
4045 assert(tcf->buf);
4046 if (!seq) {
4047 /* seq 0 is reserved for kernel event-driven notifications. */
4048 seq = tcf->seq++;
4049 }
4050 nlh->nlmsg_seq = seq;
4051 nlh->nlmsg_flags |= NLM_F_ACK;
4052 ret = mnl_socket_sendto(tcf->nl, nlh, nlh->nlmsg_len);
4053 if (ret <= 0) {
4054 /* Message send error occurred. */
4055 rte_errno = errno;
4056 return -rte_errno;
4057 }
4058 nlh = (struct nlmsghdr *)(tcf->buf);
4059 /*
4060 * The following loop postpones non-fatal errors until multipart
4061 * messages are complete.
4062 */
4063 while (true) {
4064 ret = mnl_socket_recvfrom(tcf->nl, tcf->buf, tcf->buf_size);
4065 if (ret < 0) {
4066 err = errno;
4067 /*
4068 * In case of overflow, keep receiving until the
4069 * end of the multipart message. We may lose part
4070 * of the reply messages but still mark and return an error.
4071 */
4072 if (err != ENOSPC ||
4073 !(nlh->nlmsg_flags & NLM_F_MULTI) ||
4074 nlh->nlmsg_type == NLMSG_DONE)
4075 break;
4076 } else {
4077 ret = mnl_cb_run(nlh, ret, seq, portid, cb, arg);
4078 if (!ret) {
4079 /*
4080 * libmnl returns 0 if DONE or
4081 * success ACK message found.
4082 */
4083 break;
4084 }
4085 if (ret < 0) {
4086 /*
4087 * ACK message with error found
4088 * or some error occurred.
4089 */
4090 err = errno;
4091 break;
4092 }
4093 /* We should continue receiving. */
4094 }
4095 }
4096 if (!err)
4097 return 0;
4098 rte_errno = err;
4099 return -err;
4100 }
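
/*
 * Usage sketch (editor's illustration only): building a dump request and
 * letting flow_tcf_nl_ack() run a collect callback over every message of
 * the multipart reply, as the cleanup routines below do. "collect_cb" and
 * "ctx" are placeholders for an actual callback and its context.
 *
 *	struct nlmsghdr *nlh = mnl_nlmsg_put_header(tcf->buf);
 *	struct ifinfomsg *ifm;
 *
 *	nlh->nlmsg_type = RTM_GETLINK;
 *	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
 *	ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
 *	ifm->ifi_family = AF_UNSPEC;
 *	if (flow_tcf_nl_ack(tcf, nlh, collect_cb, &ctx))
 *		DRV_LOG(WARNING, "netlink: query device list error");
 */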
4101
4102 #define MNL_BUF_EXTRA_SPACE 16
4103 #define MNL_REQUEST_SIZE_MIN 256
4104 #define MNL_REQUEST_SIZE_MAX 2048
4105 #define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \
4106 MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX)
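
/*
 * Worked example (editor's note): with a typical 4 KiB page size
 * MNL_REQUEST_SIZE evaluates to RTE_MIN(RTE_MAX(4096, 256), 2048) == 2048,
 * i.e. the command buffers below are clamped to MNL_REQUEST_SIZE_MAX bytes.
 */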
4107
4108 /* Data structures used by flow_tcf_xxx_cb() routines. */
4109 struct tcf_nlcb_buf {
4110 LIST_ENTRY(tcf_nlcb_buf) next;
4111 uint32_t size;
4112 alignas(struct nlmsghdr)
4113 uint8_t msg[]; /**< Netlink message data. */
4114 };
4115
4116 struct tcf_nlcb_context {
4117 unsigned int ifindex; /**< Base interface index. */
4118 uint32_t bufsize;
4119 LIST_HEAD(, tcf_nlcb_buf) nlbuf;
4120 };
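
/*
 * Typical initialization (sketch mirroring the cleanup routines below):
 *
 *	struct tcf_nlcb_context ctx = {
 *		.ifindex = ifindex,
 *		.bufsize = MNL_REQUEST_SIZE,
 *		.nlbuf = LIST_HEAD_INITIALIZER(),
 *	};
 */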
4121
4122 /**
4123 * Allocate space for a netlink command in the buffer list.
4124 *
4125 * @param[in, out] ctx
4126 * Pointer to callback context with command buffers list.
4127 * @param[in] size
4128 * Required size of data buffer to be allocated.
4129 *
4130 * @return
4131 * Pointer to allocated memory, aligned as message header.
4132 * NULL if some error occurred.
4133 */
4134 static struct nlmsghdr *
4135 flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size)
4136 {
4137 struct tcf_nlcb_buf *buf;
4138 struct nlmsghdr *nlh;
4139
4140 size = NLMSG_ALIGN(size);
4141 buf = LIST_FIRST(&ctx->nlbuf);
4142 if (buf && (buf->size + size) <= ctx->bufsize) {
4143 nlh = (struct nlmsghdr *)&buf->msg[buf->size];
4144 buf->size += size;
4145 return nlh;
4146 }
4147 if (size > ctx->bufsize) {
4148 DRV_LOG(WARNING, "netlink: too long command buffer requested");
4149 return NULL;
4150 }
4151 buf = rte_malloc(__func__,
4152 ctx->bufsize + sizeof(struct tcf_nlcb_buf),
4153 alignof(struct tcf_nlcb_buf));
4154 if (!buf) {
4155 DRV_LOG(WARNING, "netlink: no memory for command buffer");
4156 return NULL;
4157 }
4158 LIST_INSERT_HEAD(&ctx->nlbuf, buf, next);
4159 buf->size = size;
4160 nlh = (struct nlmsghdr *)&buf->msg[0];
4161 return nlh;
4162 }
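
/*
 * Usage sketch (editor's illustration only): callers compute the exact
 * command size with the MNL_ALIGN()/SZ_NLATTR_xxx() helpers, reserve it
 * here and build the message in place, e.g. as flow_tcf_collect_vxlan_cb()
 * does below:
 *
 *	size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
 *	       MNL_ALIGN(sizeof(struct ifinfomsg));
 *	cmd = flow_tcf_alloc_nlcmd(ctx, size);
 *	if (!cmd) {
 *		rte_errno = ENOMEM;
 *		return -rte_errno;
 *	}
 *	cmd = mnl_nlmsg_put_header(cmd);
 *	cmd->nlmsg_type = RTM_DELLINK;
 *	cmd->nlmsg_flags = NLM_F_REQUEST;
 */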
4163
4164 /**
4165 * Send the buffers with prepared netlink commands. Scans the list and
4166 * sends all found buffers. Buffers are sent and freed regardless of
4167 * errors in order to prevent memory leakage.
4168 *
4169 * @param[in] tcf
4170 * Context object initialized by mlx5_flow_tcf_context_create().
4171 * @param[in, out] ctx
4172 * Pointer to callback context with command buffers list.
4173 *
4174 * @return
4175 * Zero value on success, negative errno value otherwise
4176 * and rte_errno is set.
4177 */
4178 static int
4179 flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf,
4180 struct tcf_nlcb_context *ctx)
4181 {
4182 struct tcf_nlcb_buf *bc = LIST_FIRST(&ctx->nlbuf);
4183 int ret = 0;
4184
4185 while (bc) {
4186 struct tcf_nlcb_buf *bn = LIST_NEXT(bc, next);
4187 struct nlmsghdr *nlh;
4188 uint32_t msg = 0;
4189 int rc;
4190
4191 while (msg < bc->size) {
4192 /*
4193 * Send Netlink commands from the buffer one by one.
4194 * If we sent multiple rule deletion commands in one
4195 * Netlink message and some error occurred, it could
4196 * cause multiple ACK error messages and break the
4197 * sequence numbers of the Netlink communication,
4198 * because we expect only one ACK reply.
4199 */
4200 assert((bc->size - msg) >= sizeof(struct nlmsghdr));
4201 nlh = (struct nlmsghdr *)&bc->msg[msg];
4202 assert((bc->size - msg) >= nlh->nlmsg_len);
4203 msg += nlh->nlmsg_len;
4204 rc = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
4205 if (rc) {
4206 DRV_LOG(WARNING,
4207 "netlink: cleanup error %d", rc);
4208 if (!ret)
4209 ret = rc;
4210 }
4211 }
4212 rte_free(bc);
4213 bc = bn;
4214 }
4215 LIST_INIT(&ctx->nlbuf);
4216 return ret;
4217 }
4218
4219 /**
4220 * Collect local IP address rules with the scope link attribute on the
4221 * specified network device. This is a callback routine called by libmnl
4222 * mnl_cb_run() in a loop for every message in the received packet.
4223 *
4224 * @param[in] nlh
4225 * Pointer to reply header.
4226 * @param[in, out] arg
4227 * Opaque data pointer for this callback.
4228 *
4229 * @return
4230 * A positive, nonzero value on success, negative errno value otherwise
4231 * and rte_errno is set.
4232 */
4233 static int
4234 flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
4235 {
4236 struct tcf_nlcb_context *ctx = arg;
4237 struct nlmsghdr *cmd;
4238 struct ifaddrmsg *ifa;
4239 struct nlattr *na;
4240 struct nlattr *na_local = NULL;
4241 struct nlattr *na_peer = NULL;
4242 unsigned char family;
4243 uint32_t size;
4244
4245 if (nlh->nlmsg_type != RTM_NEWADDR) {
4246 rte_errno = EINVAL;
4247 return -rte_errno;
4248 }
4249 ifa = mnl_nlmsg_get_payload(nlh);
4250 family = ifa->ifa_family;
4251 if (ifa->ifa_index != ctx->ifindex ||
4252 ifa->ifa_scope != RT_SCOPE_LINK ||
4253 !(ifa->ifa_flags & IFA_F_PERMANENT) ||
4254 (family != AF_INET && family != AF_INET6))
4255 return 1;
4256 mnl_attr_for_each(na, nlh, sizeof(*ifa)) {
4257 switch (mnl_attr_get_type(na)) {
4258 case IFA_LOCAL:
4259 na_local = na;
4260 break;
4261 case IFA_ADDRESS:
4262 na_peer = na;
4263 break;
4264 }
4265 if (na_local && na_peer)
4266 break;
4267 }
4268 if (!na_local || !na_peer)
4269 return 1;
4270 /* Local rule found with scope link, permanent and assigned peer. */
4271 size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4272 MNL_ALIGN(sizeof(struct ifaddrmsg)) +
4273 (family == AF_INET6 ? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4274 : 2 * SZ_NLATTR_TYPE_OF(uint32_t));
4275 cmd = flow_tcf_alloc_nlcmd(ctx, size);
4276 if (!cmd) {
4277 rte_errno = ENOMEM;
4278 return -rte_errno;
4279 }
4280 cmd = mnl_nlmsg_put_header(cmd);
4281 cmd->nlmsg_type = RTM_DELADDR;
4282 cmd->nlmsg_flags = NLM_F_REQUEST;
4283 ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa));
4284 ifa->ifa_flags = IFA_F_PERMANENT;
4285 ifa->ifa_scope = RT_SCOPE_LINK;
4286 ifa->ifa_index = ctx->ifindex;
4287 if (family == AF_INET) {
4288 ifa->ifa_family = AF_INET;
4289 ifa->ifa_prefixlen = 32;
4290 mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(na_local));
4291 mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(na_peer));
4292 } else {
4293 ifa->ifa_family = AF_INET6;
4294 ifa->ifa_prefixlen = 128;
4295 mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN,
4296 mnl_attr_get_payload(na_local));
4297 mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN,
4298 mnl_attr_get_payload(na_peer));
4299 }
4300 assert(size == cmd->nlmsg_len);
4301 return 1;
4302 }
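
/*
 * Hedged analogy (editor's note, not from the original sources): for the
 * IPv4 branch the queued RTM_DELADDR command is roughly what
 *
 *	ip addr del <local>/32 peer <peer> scope link dev <ifindex>
 *
 * would request; it is only stored here and sent later by
 * flow_tcf_send_nlcmd().
 */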
4303
4304 /**
4305 * Cleanup the local IP addresses on outer interface.
4306 *
4307 * @param[in] tcf
4308 * Context object initialized by mlx5_flow_tcf_context_create().
4309 * @param[in] ifindex
4310 * Network interface index to perform cleanup.
4311 */
4312 static void
4313 flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
4314 unsigned int ifindex)
4315 {
4316 struct nlmsghdr *nlh;
4317 struct ifaddrmsg *ifa;
4318 struct tcf_nlcb_context ctx = {
4319 .ifindex = ifindex,
4320 .bufsize = MNL_REQUEST_SIZE,
4321 .nlbuf = LIST_HEAD_INITIALIZER(),
4322 };
4323 int ret;
4324
4325 assert(ifindex);
4326 /*
4327 * Seek and destroy leftovers of local IP addresses with
4328 * matching properties "scope link".
4329 */
4330 nlh = mnl_nlmsg_put_header(tcf->buf);
4331 nlh->nlmsg_type = RTM_GETADDR;
4332 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4333 ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4334 ifa->ifa_family = AF_UNSPEC;
4335 ifa->ifa_index = ifindex;
4336 ifa->ifa_scope = RT_SCOPE_LINK;
4337 ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_local_cb, &ctx);
4338 if (ret)
4339 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4340 ret = flow_tcf_send_nlcmd(tcf, &ctx);
4341 if (ret)
4342 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4343 }
4344
4345 /**
4346 * Collect neigh permanent rules on specified network device.
4347 * This is a callback routine called by libmnl mnl_cb_run() in a loop
4348 * for every message in the received packet.
4349 *
4350 * @param[in] nlh
4351 * Pointer to reply header.
4352 * @param[in, out] arg
4353 * Opaque data pointer for this callback.
4354 *
4355 * @return
4356 * A positive, nonzero value on success, negative errno value otherwise
4357 * and rte_errno is set.
4358 */
4359 static int
4360 flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
4361 {
4362 struct tcf_nlcb_context *ctx = arg;
4363 struct nlmsghdr *cmd;
4364 struct ndmsg *ndm;
4365 struct nlattr *na;
4366 struct nlattr *na_ip = NULL;
4367 struct nlattr *na_mac = NULL;
4368 unsigned char family;
4369 uint32_t size;
4370
4371 if (nlh->nlmsg_type != RTM_NEWNEIGH) {
4372 rte_errno = EINVAL;
4373 return -rte_errno;
4374 }
4375 ndm = mnl_nlmsg_get_payload(nlh);
4376 family = ndm->ndm_family;
4377 if (ndm->ndm_ifindex != (int)ctx->ifindex ||
4378 !(ndm->ndm_state & NUD_PERMANENT) ||
4379 (family != AF_INET && family != AF_INET6))
4380 return 1;
4381 mnl_attr_for_each(na, nlh, sizeof(*ndm)) {
4382 switch (mnl_attr_get_type(na)) {
4383 case NDA_DST:
4384 na_ip = na;
4385 break;
4386 case NDA_LLADDR:
4387 na_mac = na;
4388 break;
4389 }
4390 if (na_mac && na_ip)
4391 break;
4392 }
4393 if (!na_mac || !na_ip)
4394 return 1;
4395 /* Neigh rule with permanent attribute found. */
4396 size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4397 MNL_ALIGN(sizeof(struct ndmsg)) +
4398 SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) +
4399 (family == AF_INET6 ? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
4400 : SZ_NLATTR_TYPE_OF(uint32_t));
4401 cmd = flow_tcf_alloc_nlcmd(ctx, size);
4402 if (!cmd) {
4403 rte_errno = ENOMEM;
4404 return -rte_errno;
4405 }
4406 cmd = mnl_nlmsg_put_header(cmd);
4407 cmd->nlmsg_type = RTM_DELNEIGH;
4408 cmd->nlmsg_flags = NLM_F_REQUEST;
4409 ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm));
4410 ndm->ndm_ifindex = ctx->ifindex;
4411 ndm->ndm_state = NUD_PERMANENT;
4412 ndm->ndm_flags = 0;
4413 ndm->ndm_type = 0;
4414 if (family == AF_INET) {
4415 ndm->ndm_family = AF_INET;
4416 mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(na_ip));
4417 } else {
4418 ndm->ndm_family = AF_INET6;
4419 mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN,
4420 mnl_attr_get_payload(na_ip));
4421 }
4422 mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN,
4423 mnl_attr_get_payload(na_mac));
4424 assert(size == cmd->nlmsg_len);
4425 return 1;
4426 }
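
/*
 * Hedged analogy (editor's note): the queued RTM_DELNEIGH command is
 * roughly equivalent to
 *
 *	ip neigh del <dst_ip> lladdr <mac> dev <ifindex>
 *
 * and, like the local address case, is sent later by flow_tcf_send_nlcmd().
 */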
4427
4428 /**
4429 * Cleanup the neigh rules on outer interface.
4430 *
4431 * @param[in] tcf
4432 * Context object initialized by mlx5_flow_tcf_context_create().
4433 * @param[in] ifindex
4434 * Network interface index to perform cleanup.
4435 */
4436 static void
4437 flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf,
4438 unsigned int ifindex)
4439 {
4440 struct nlmsghdr *nlh;
4441 struct ndmsg *ndm;
4442 struct tcf_nlcb_context ctx = {
4443 .ifindex = ifindex,
4444 .bufsize = MNL_REQUEST_SIZE,
4445 .nlbuf = LIST_HEAD_INITIALIZER(),
4446 };
4447 int ret;
4448
4449 assert(ifindex);
4450 /* Seek and destroy leftovers of neigh rules. */
4451 nlh = mnl_nlmsg_put_header(tcf->buf);
4452 nlh->nlmsg_type = RTM_GETNEIGH;
4453 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4454 ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4455 ndm->ndm_family = AF_UNSPEC;
4456 ndm->ndm_ifindex = ifindex;
4457 ndm->ndm_state = NUD_PERMANENT;
4458 ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_neigh_cb, &ctx);
4459 if (ret)
4460 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4461 ret = flow_tcf_send_nlcmd(tcf, &ctx);
4462 if (ret)
4463 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4464 }
4465
4466 /**
4467 * Collect indices of VXLAN encap/decap interfaces associated with device.
4468 * This is a callback routine called by libmnl mnl_cb_run() in a loop
4469 * for every message in the received packet.
4470 *
4471 * @param[in] nlh
4472 * Pointer to reply header.
4473 * @param[in, out] arg
4474 * Opaque data pointer for this callback.
4475 *
4476 * @return
4477 * A positive, nonzero value on success, negative errno value otherwise
4478 * and rte_errno is set.
4479 */
4480 static int
4481 flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
4482 {
4483 struct tcf_nlcb_context *ctx = arg;
4484 struct nlmsghdr *cmd;
4485 struct ifinfomsg *ifm;
4486 struct nlattr *na;
4487 struct nlattr *na_info = NULL;
4488 struct nlattr *na_vxlan = NULL;
4489 bool found = false;
4490 unsigned int vxindex;
4491 uint32_t size;
4492
4493 if (nlh->nlmsg_type != RTM_NEWLINK) {
4494 rte_errno = EINVAL;
4495 return -rte_errno;
4496 }
4497 ifm = mnl_nlmsg_get_payload(nlh);
4498 if (!ifm->ifi_index) {
4499 rte_errno = EINVAL;
4500 return -rte_errno;
4501 }
4502 mnl_attr_for_each(na, nlh, sizeof(*ifm))
4503 if (mnl_attr_get_type(na) == IFLA_LINKINFO) {
4504 na_info = na;
4505 break;
4506 }
4507 if (!na_info)
4508 return 1;
4509 mnl_attr_for_each_nested(na, na_info) {
4510 switch (mnl_attr_get_type(na)) {
4511 case IFLA_INFO_KIND:
4512 if (!strncmp("vxlan", mnl_attr_get_str(na),
4513 mnl_attr_get_len(na)))
4514 found = true;
4515 break;
4516 case IFLA_INFO_DATA:
4517 na_vxlan = na;
4518 break;
4519 }
4520 if (found && na_vxlan)
4521 break;
4522 }
4523 if (!found || !na_vxlan)
4524 return 1;
4525 found = false;
4526 mnl_attr_for_each_nested(na, na_vxlan) {
4527 if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK &&
4528 mnl_attr_get_u32(na) == ctx->ifindex) {
4529 found = true;
4530 break;
4531 }
4532 }
4533 if (!found)
4534 return 1;
4535 /* Attached VXLAN device found, store the command to delete. */
4536 vxindex = ifm->ifi_index;
4537 size = MNL_ALIGN(sizeof(struct nlmsghdr)) +
4538 MNL_ALIGN(sizeof(struct ifinfomsg));
4539 cmd = flow_tcf_alloc_nlcmd(ctx, size);
4540 if (!cmd) {
4541 rte_errno = ENOMEM;
4542 return -rte_errno;
4543 }
4544 cmd = mnl_nlmsg_put_header(cmd);
4545 cmd->nlmsg_type = RTM_DELLINK;
4546 cmd->nlmsg_flags = NLM_F_REQUEST;
4547 ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
4548 ifm->ifi_family = AF_UNSPEC;
4549 ifm->ifi_index = vxindex;
4550 assert(size == cmd->nlmsg_len);
4551 return 1;
4552 }
4553
4554 /**
4555 * Cleanup the outer interface. Removes all found vxlan devices
4556 * attached to specified index, flushes the neigh and local IP
4557 * database.
4558 *
4559 * @param[in] tcf
4560 * Context object initialized by mlx5_flow_tcf_context_create().
4561 * @param[in] ifindex
4562 * Network interface index to perform cleanup.
4563 */
4564 static void
4565 flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf,
4566 unsigned int ifindex)
4567 {
4568 struct nlmsghdr *nlh;
4569 struct ifinfomsg *ifm;
4570 struct tcf_nlcb_context ctx = {
4571 .ifindex = ifindex,
4572 .bufsize = MNL_REQUEST_SIZE,
4573 .nlbuf = LIST_HEAD_INITIALIZER(),
4574 };
4575 int ret;
4576
4577 assert(ifindex);
4578 /*
4579 * Seek and destroy leftover VXLAN encap/decap interfaces with
4580 * matching properties.
4581 */
4582 nlh = mnl_nlmsg_put_header(tcf->buf);
4583 nlh->nlmsg_type = RTM_GETLINK;
4584 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
4585 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
4586 ifm->ifi_family = AF_UNSPEC;
4587 ret = flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_vxlan_cb, &ctx);
4588 if (ret)
4589 DRV_LOG(WARNING, "netlink: query device list error %d", ret);
4590 ret = flow_tcf_send_nlcmd(tcf, &ctx);
4591 if (ret)
4592 DRV_LOG(WARNING, "netlink: device delete error %d", ret);
4593 }
4594
4595 /**
4596 * Emit a Netlink message to add/remove a local address on the outer device.
4597 * The address being added is visible within the link only (scope link).
4598 *
4599 * Note that an implicit route is maintained by the kernel due to the
4600 * presence of a peer address (IFA_ADDRESS).
4601 *
4602 * These rules are used for encapsulation only and allow the outer
4603 * tunnel source IP address to be assigned.
4604 *
4605 * @param[in] tcf
4606 * Libmnl socket context object.
4607 * @param[in] encap
4608 * Encapsulation properties (source address and its peer).
4609 * @param[in] ifindex
4610 * Network interface to apply rule.
4611 * @param[in] enable
4612 * Toggle between add and remove.
4613 * @param[out] error
4614 * Perform verbose error reporting if not NULL.
4615 *
4616 * @return
4617 * 0 on success, a negative errno value otherwise and rte_errno is set.
4618 */
4619 static int
4620 flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf,
4621 const struct flow_tcf_vxlan_encap *encap,
4622 unsigned int ifindex,
4623 bool enable,
4624 struct rte_flow_error *error)
4625 {
4626 struct nlmsghdr *nlh;
4627 struct ifaddrmsg *ifa;
4628 alignas(struct nlmsghdr)
4629 uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
4630
4631 nlh = mnl_nlmsg_put_header(buf);
4632 nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
4633 nlh->nlmsg_flags =
4634 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4635 nlh->nlmsg_seq = 0;
4636 ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
4637 ifa->ifa_flags = IFA_F_PERMANENT;
4638 ifa->ifa_scope = RT_SCOPE_LINK;
4639 ifa->ifa_index = ifindex;
4640 if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4641 ifa->ifa_family = AF_INET;
4642 ifa->ifa_prefixlen = 32;
4643 mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src);
4644 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST)
4645 mnl_attr_put_u32(nlh, IFA_ADDRESS,
4646 encap->ipv4.dst);
4647 } else {
4648 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4649 ifa->ifa_family = AF_INET6;
4650 ifa->ifa_prefixlen = 128;
4651 mnl_attr_put(nlh, IFA_LOCAL,
4652 sizeof(encap->ipv6.src),
4653 &encap->ipv6.src);
4654 if (encap->mask & FLOW_TCF_ENCAP_IPV6_DST)
4655 mnl_attr_put(nlh, IFA_ADDRESS,
4656 sizeof(encap->ipv6.dst),
4657 &encap->ipv6.dst);
4658 }
4659 if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4660 return 0;
4661 return rte_flow_error_set(error, rte_errno,
4662 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4663 "netlink: cannot complete IFA request"
4664 " (ip addr add)");
4665 }
4666
4667 /**
4668 * Emit Netlink message to add/remove neighbor.
4669 *
4670 * @param[in] tcf
4671 * Libmnl socket context object.
4672 * @param[in] encap
4673 * Encapsulation properties (destination address).
4674 * @param[in] ifindex
4675 * Network interface.
4676 * @param[in] enable
4677 * Toggle between add and remove.
4678 * @param[out] error
4679 * Perform verbose error reporting if not NULL.
4680 *
4681 * @return
4682 * 0 on success, a negative errno value otherwise and rte_errno is set.
4683 */
4684 static int
4685 flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf,
4686 const struct flow_tcf_vxlan_encap *encap,
4687 unsigned int ifindex,
4688 bool enable,
4689 struct rte_flow_error *error)
4690 {
4691 struct nlmsghdr *nlh;
4692 struct ndmsg *ndm;
4693 alignas(struct nlmsghdr)
4694 uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
4695
4696 nlh = mnl_nlmsg_put_header(buf);
4697 nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
4698 nlh->nlmsg_flags =
4699 NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
4700 nlh->nlmsg_seq = 0;
4701 ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
4702 ndm->ndm_ifindex = ifindex;
4703 ndm->ndm_state = NUD_PERMANENT;
4704 ndm->ndm_flags = 0;
4705 ndm->ndm_type = 0;
4706 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4707 ndm->ndm_family = AF_INET;
4708 mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst);
4709 } else {
4710 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4711 ndm->ndm_family = AF_INET6;
4712 mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst),
4713 &encap->ipv6.dst);
4714 }
4715 if (encap->mask & FLOW_TCF_ENCAP_ETH_SRC && enable)
4716 DRV_LOG(WARNING,
4717 "outer ethernet source address cannot be "
4718 "forced for VXLAN encapsulation");
4719 if (encap->mask & FLOW_TCF_ENCAP_ETH_DST)
4720 mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst),
4721 &encap->eth.dst);
4722 if (!flow_tcf_nl_ack(tcf, nlh, NULL, NULL))
4723 return 0;
4724 return rte_flow_error_set(error, rte_errno,
4725 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4726 "netlink: cannot complete ND request"
4727 " (ip neigh)");
4728 }
4729
4730 /**
4731 * Manage the local IP addresses and their peer IP addresses on the
4732 * outer interface for encapsulation purposes. The kernel searches for
4733 * the appropriate device for tunnel egress traffic using the outer
4734 * source IP; this IP must be assigned to the outer network device,
4735 * otherwise the kernel rejects the rule.
4736 *
4737 * Adds or removes the addresses using the Netlink command like this:
4738 * ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter>
4739 *
4740 * The addresses are local to the netdev ("scope link"), which reduces
4741 * the risk of conflicts. Note that an implicit route is maintained by
4742 * the kernel due to the presence of a peer address (IFA_ADDRESS).
4743 *
4744 * @param[in] tcf
4745 * Libmnl socket context object.
4746 * @param[in] iface
4747 * Object, contains rule database and ifouter index.
4748 * @param[in] dev_flow
4749 * Flow object, contains the tunnel parameters (for encap only).
4750 * @param[in] enable
4751 * Toggle between add and remove.
4752 * @param[out] error
4753 * Perform verbose error reporting if not NULL.
4754 *
4755 * @return
4756 * 0 on success, a negative errno value otherwise and rte_errno is set.
4757 */
4758 static int
4759 flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf,
4760 struct tcf_irule *iface,
4761 struct mlx5_flow *dev_flow,
4762 bool enable,
4763 struct rte_flow_error *error)
4764 {
4765 const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4766 struct tcf_local_rule *rule = NULL;
4767 int ret;
4768
4769 assert(encap);
4770 assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4771 if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4772 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_DST);
4773 LIST_FOREACH(rule, &iface->local, next) {
4774 if (rule->mask & FLOW_TCF_ENCAP_IPV4_SRC &&
4775 encap->ipv4.src == rule->ipv4.src &&
4776 encap->ipv4.dst == rule->ipv4.dst) {
4777 break;
4778 }
4779 }
4780 } else {
4781 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4782 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4783 LIST_FOREACH(rule, &iface->local, next) {
4784 if (rule->mask & FLOW_TCF_ENCAP_IPV6_SRC &&
4785 !memcmp(&encap->ipv6.src, &rule->ipv6.src,
4786 sizeof(encap->ipv6.src)) &&
4787 !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4788 sizeof(encap->ipv6.dst))) {
4789 break;
4790 }
4791 }
4792 }
4793 if (rule) {
4794 if (enable) {
4795 rule->refcnt++;
4796 return 0;
4797 }
4798 if (!rule->refcnt || !--rule->refcnt) {
4799 LIST_REMOVE(rule, next);
4800 return flow_tcf_rule_local(tcf, encap,
4801 iface->ifouter, false, error);
4802 }
4803 return 0;
4804 }
4805 if (!enable) {
4806 DRV_LOG(WARNING, "disabling non-existent local rule");
4807 rte_flow_error_set(error, ENOENT,
4808 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4809 "disabling non-existent local rule");
4810 return -ENOENT;
4811 }
4812 rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule),
4813 alignof(struct tcf_local_rule));
4814 if (!rule) {
4815 rte_flow_error_set(error, ENOMEM,
4816 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4817 "unable to allocate memory for local rule");
4818 return -rte_errno;
4819 }
4820 *rule = (struct tcf_local_rule){.refcnt = 0,
4821 .mask = 0,
4822 };
4823 if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
4824 rule->mask = FLOW_TCF_ENCAP_IPV4_SRC
4825 | FLOW_TCF_ENCAP_IPV4_DST;
4826 rule->ipv4.src = encap->ipv4.src;
4827 rule->ipv4.dst = encap->ipv4.dst;
4828 } else {
4829 rule->mask = FLOW_TCF_ENCAP_IPV6_SRC
4830 | FLOW_TCF_ENCAP_IPV6_DST;
4831 memcpy(&rule->ipv6.src, &encap->ipv6.src, IPV6_ADDR_LEN);
4832 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4833 }
4834 ret = flow_tcf_rule_local(tcf, encap, iface->ifouter, true, error);
4835 if (ret) {
4836 rte_free(rule);
4837 return ret;
4838 }
4839 rule->refcnt++;
4840 LIST_INSERT_HEAD(&iface->local, rule, next);
4841 return 0;
4842 }
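
/*
 * Usage sketch (editor's illustration): the encap path below acquires both
 * ancillary rules and rolls the local rule back if the neigh rule fails:
 *
 *	ret = flow_tcf_encap_local(tcf, iface, dev_flow, true, error);
 *	if (!ret) {
 *		ret = flow_tcf_encap_neigh(tcf, iface, dev_flow, true, error);
 *		if (ret)
 *			flow_tcf_encap_local(tcf, iface, dev_flow, false, error);
 *	}
 */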
4843
4844 /**
4845 * Manage the destination MAC/IP address neigh database; the kernel uses
4846 * it to determine the destination MAC address within the encapsulation
4847 * header. Adds or removes entries using a Netlink command like this:
4848 * ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
4849 *
4850 * @param[in] tcf
4851 * Libmnl socket context object.
4852 * @param[in] iface
4853 * Object, contains rule database and ifouter index.
4854 * @param[in] dev_flow
4855 * Flow object, contains the tunnel parameters (for encap only).
4856 * @param[in] enable
4857 * Toggle between add and remove.
4858 * @param[out] error
4859 * Perform verbose error reporting if not NULL.
4860 *
4861 * @return
4862 * 0 on success, a negative errno value otherwise and rte_errno is set.
4863 */
4864 static int
4865 flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf,
4866 struct tcf_irule *iface,
4867 struct mlx5_flow *dev_flow,
4868 bool enable,
4869 struct rte_flow_error *error)
4870 {
4871 const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
4872 struct tcf_neigh_rule *rule = NULL;
4873 int ret;
4874
4875 assert(encap);
4876 assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
4877 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4878 assert(encap->mask & FLOW_TCF_ENCAP_IPV4_SRC);
4879 LIST_FOREACH(rule, &iface->neigh, next) {
4880 if (rule->mask & FLOW_TCF_ENCAP_IPV4_DST &&
4881 encap->ipv4.dst == rule->ipv4.dst) {
4882 break;
4883 }
4884 }
4885 } else {
4886 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
4887 assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
4888 LIST_FOREACH(rule, &iface->neigh, next) {
4889 if (rule->mask & FLOW_TCF_ENCAP_IPV6_DST &&
4890 !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
4891 sizeof(encap->ipv6.dst))) {
4892 break;
4893 }
4894 }
4895 }
4896 if (rule) {
4897 if (memcmp(&encap->eth.dst, &rule->eth,
4898 sizeof(encap->eth.dst))) {
4899 DRV_LOG(WARNING, "Destination MAC differs"
4900 " in neigh rule");
4901 rte_flow_error_set(error, EEXIST,
4902 RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
4903 NULL, "Different MAC address"
4904 " neigh rule for the same"
4905 " destination IP");
4906 return -EEXIST;
4907 }
4908 if (enable) {
4909 rule->refcnt++;
4910 return 0;
4911 }
4912 if (!rule->refcnt || !--rule->refcnt) {
4913 LIST_REMOVE(rule, next);
4914 return flow_tcf_rule_neigh(tcf, encap,
4915 iface->ifouter,
4916 false, error);
4917 }
4918 return 0;
4919 }
4920 if (!enable) {
4921 DRV_LOG(WARNING, "disabling non-existent neigh rule");
4922 rte_flow_error_set(error, ENOENT,
4923 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4924 "disabling non-existent neigh rule");
4925 return -ENOENT;
4926 }
4927 rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule),
4928 alignof(struct tcf_neigh_rule));
4929 if (!rule) {
4930 rte_flow_error_set(error, ENOMEM,
4931 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
4932 "unable to allocate memory for neigh rule");
4933 return -rte_errno;
4934 }
4935 *rule = (struct tcf_neigh_rule){.refcnt = 0,
4936 .mask = 0,
4937 };
4938 if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
4939 rule->mask = FLOW_TCF_ENCAP_IPV4_DST;
4940 rule->ipv4.dst = encap->ipv4.dst;
4941 } else {
4942 rule->mask = FLOW_TCF_ENCAP_IPV6_DST;
4943 memcpy(&rule->ipv6.dst, &encap->ipv6.dst, IPV6_ADDR_LEN);
4944 }
4945 memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth));
4946 ret = flow_tcf_rule_neigh(tcf, encap, iface->ifouter, true, error);
4947 if (ret) {
4948 rte_free(rule);
4949 return ret;
4950 }
4951 rule->refcnt++;
4952 LIST_INSERT_HEAD(&iface->neigh, rule, next);
4953 return 0;
4954 }
4955
4956 /* VXLAN encap rule database for outer interfaces. */
4957 static LIST_HEAD(, tcf_irule) iface_list_vxlan = LIST_HEAD_INITIALIZER();
4958
4959 /* VTEP device list is shared between PMD port instances. */
4960 static LIST_HEAD(, tcf_vtep) vtep_list_vxlan = LIST_HEAD_INITIALIZER();
4961 static pthread_mutex_t vtep_list_mutex = PTHREAD_MUTEX_INITIALIZER;
4962
4963 /**
4964 * Acquire the VXLAN encap rules container for specified interface.
4965 * First looks for the container in the list of existing ones, then
4966 * creates and initializes a new container if an existing one is not found.
4967 *
4968 * @param[in] tcf
4969 * Context object initialized by mlx5_flow_tcf_context_create().
4970 * @param[in] ifouter
4971 * Network interface index to create VXLAN encap rules on.
4972 * @param[out] error
4973 * Perform verbose error reporting if not NULL.
4974 * @return
4975 * Rule container pointer on success,
4976 * NULL otherwise and rte_errno is set.
4977 */
4978 static struct tcf_irule*
4979 flow_tcf_encap_irule_acquire(struct mlx5_flow_tcf_context *tcf,
4980 unsigned int ifouter,
4981 struct rte_flow_error *error)
4982 {
4983 struct tcf_irule *iface;
4984
4985 /* Look whether the container for encap rules is created. */
4986 assert(ifouter);
4987 LIST_FOREACH(iface, &iface_list_vxlan, next) {
4988 if (iface->ifouter == ifouter)
4989 break;
4990 }
4991 if (iface) {
4992 /* Container already exists, just increment the reference. */
4993 iface->refcnt++;
4994 return iface;
4995 }
4996 /* Not found, we should create the new container. */
4997 iface = rte_zmalloc(__func__, sizeof(*iface),
4998 alignof(struct tcf_irule));
4999 if (!iface) {
5000 rte_flow_error_set(error, ENOMEM,
5001 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5002 "unable to allocate memory for container");
5003 return NULL;
5004 }
5005 *iface = (struct tcf_irule){
5006 .local = LIST_HEAD_INITIALIZER(),
5007 .neigh = LIST_HEAD_INITIALIZER(),
5008 .ifouter = ifouter,
5009 .refcnt = 1,
5010 };
5011 /* Perform interface cleanup for the newly created container. */
5012 flow_tcf_encap_iface_cleanup(tcf, ifouter);
5013 flow_tcf_encap_local_cleanup(tcf, ifouter);
5014 flow_tcf_encap_neigh_cleanup(tcf, ifouter);
5015 LIST_INSERT_HEAD(&iface_list_vxlan, iface, next);
5016 return iface;
5017 }
5018
5019 /**
5020 * Releases VXLAN encap rules container by pointer. Decrements the
5021 * reference counter and deletes the container if counter is zero.
5022 *
5023 * @param[in] irule
5024 * VXLAN rule container pointer to release.
5025 */
5026 static void
5027 flow_tcf_encap_irule_release(struct tcf_irule *iface)
5028 {
5029 assert(iface->refcnt);
5030 if (--iface->refcnt == 0) {
5031 /* Reference counter is zero, delete the container. */
5032 assert(LIST_EMPTY(&iface->local));
5033 assert(LIST_EMPTY(&iface->neigh));
5034 LIST_REMOVE(iface, next);
5035 rte_free(iface);
5036 }
5037 }
5038
5039 /**
5040 * Deletes VTEP network device.
5041 *
5042 * @param[in] tcf
5043 * Context object initialized by mlx5_flow_tcf_context_create().
5044 * @param[in] vtep
5045 * Object representing the network device to delete. Memory
5046 * allocated for this object is freed by routine.
5047 */
5048 static void
5049 flow_tcf_vtep_delete(struct mlx5_flow_tcf_context *tcf,
5050 struct tcf_vtep *vtep)
5051 {
5052 struct nlmsghdr *nlh;
5053 struct ifinfomsg *ifm;
5054 alignas(struct nlmsghdr)
5055 uint8_t buf[mnl_nlmsg_size(MNL_ALIGN(sizeof(*ifm))) +
5056 MNL_BUF_EXTRA_SPACE];
5057 int ret;
5058
5059 assert(!vtep->refcnt);
5060 /* Delete only ifaces that we actually created. */
5061 if (vtep->created && vtep->ifindex) {
5062 DRV_LOG(INFO, "VTEP delete (%d)", vtep->ifindex);
5063 nlh = mnl_nlmsg_put_header(buf);
5064 nlh->nlmsg_type = RTM_DELLINK;
5065 nlh->nlmsg_flags = NLM_F_REQUEST;
5066 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5067 ifm->ifi_family = AF_UNSPEC;
5068 ifm->ifi_index = vtep->ifindex;
5069 assert(sizeof(buf) >= nlh->nlmsg_len);
5070 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5071 if (ret)
5072 DRV_LOG(WARNING, "netlink: error deleting vxlan"
5073 " encap/decap ifindex %u",
5074 ifm->ifi_index);
5075 }
5076 rte_free(vtep);
5077 }
5078
5079 /**
5080 * Creates VTEP network device.
5081 *
5082 * @param[in] tcf
5083 * Context object initialized by mlx5_flow_tcf_context_create().
5084 * @param[in] port
5085 * UDP port of created VTEP device.
5086 * @param[out] error
5087 * Perform verbose error reporting if not NULL.
5088 *
5089 * @return
5090 * Pointer to created device structure on success,
5091 * NULL otherwise and rte_errno is set.
5092 */
5093 static struct tcf_vtep*
5094 flow_tcf_vtep_create(struct mlx5_flow_tcf_context *tcf,
5095 uint16_t port, struct rte_flow_error *error)
5096 {
5097 struct tcf_vtep *vtep;
5098 struct nlmsghdr *nlh;
5099 struct ifinfomsg *ifm;
5100 char name[sizeof(MLX5_VXLAN_DEVICE_PFX) + 24];
5101 alignas(struct nlmsghdr)
5102 uint8_t buf[mnl_nlmsg_size(sizeof(*ifm)) +
5103 SZ_NLATTR_DATA_OF(sizeof(name)) +
5104 SZ_NLATTR_NEST * 2 +
5105 SZ_NLATTR_STRZ_OF("vxlan") +
5106 SZ_NLATTR_DATA_OF(sizeof(uint32_t)) +
5107 SZ_NLATTR_DATA_OF(sizeof(uint16_t)) +
5108 SZ_NLATTR_DATA_OF(sizeof(uint8_t)) * 3 +
5109 MNL_BUF_EXTRA_SPACE];
5110 struct nlattr *na_info;
5111 struct nlattr *na_vxlan;
5112 rte_be16_t vxlan_port = rte_cpu_to_be_16(port);
5113 int ret;
5114
5115 vtep = rte_zmalloc(__func__, sizeof(*vtep), alignof(struct tcf_vtep));
5116 if (!vtep) {
5117 rte_flow_error_set(error, ENOMEM,
5118 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5119 "unable to allocate memory for VTEP");
5120 return NULL;
5121 }
5122 *vtep = (struct tcf_vtep){
5123 .port = port,
5124 };
5125 memset(buf, 0, sizeof(buf));
5126 nlh = mnl_nlmsg_put_header(buf);
5127 nlh->nlmsg_type = RTM_NEWLINK;
5128 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
5129 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5130 ifm->ifi_family = AF_UNSPEC;
5131 ifm->ifi_type = 0;
5132 ifm->ifi_index = 0;
5133 ifm->ifi_flags = IFF_UP;
5134 ifm->ifi_change = 0xffffffff;
5135 snprintf(name, sizeof(name), "%s%u", MLX5_VXLAN_DEVICE_PFX, port);
5136 mnl_attr_put_strz(nlh, IFLA_IFNAME, name);
5137 na_info = mnl_attr_nest_start(nlh, IFLA_LINKINFO);
5138 assert(na_info);
5139 mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan");
5140 na_vxlan = mnl_attr_nest_start(nlh, IFLA_INFO_DATA);
5141 assert(na_vxlan);
5142 #ifdef HAVE_IFLA_VXLAN_COLLECT_METADATA
5143 /*
5144 * RH 7.2 does not support metadata for the tunnel device.
5145 * It does not matter because we are going to use the
5146 * hardware offload provided by the mlx5 driver.
5147 */
5148 mnl_attr_put_u8(nlh, IFLA_VXLAN_COLLECT_METADATA, 1);
5149 #endif
5150 mnl_attr_put_u8(nlh, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1);
5151 mnl_attr_put_u8(nlh, IFLA_VXLAN_LEARNING, 0);
5152 mnl_attr_put_u16(nlh, IFLA_VXLAN_PORT, vxlan_port);
5153 #ifndef HAVE_IFLA_VXLAN_COLLECT_METADATA
5154 /*
5155 * We must specify the VNI explicitly if metadata is not supported.
5156 * Note that the VNI is transferred in native endianness format.
5157 */
5158 mnl_attr_put_u16(nlh, IFLA_VXLAN_ID, MLX5_VXLAN_DEFAULT_VNI);
5159 #endif
5160 mnl_attr_nest_end(nlh, na_vxlan);
5161 mnl_attr_nest_end(nlh, na_info);
5162 assert(sizeof(buf) >= nlh->nlmsg_len);
5163 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5164 if (ret) {
5165 DRV_LOG(WARNING,
5166 "netlink: VTEP %s create failure (%d)",
5167 name, rte_errno);
5168 if (rte_errno != EEXIST)
5169 /*
5170 * Some unhandled error occurred or device is
5171 * for encapsulation and cannot be shared.
5172 */
5173 goto error;
5174 } else {
5175 /*
5176 * Mark the device we actually created.
5177 * We should explicitly delete it
5178 * when we do not need it anymore.
5179 */
5180 vtep->created = 1;
5181 vtep->waitreg = 1;
5182 }
5183 /* Try to get the ifindex of the created or pre-existing device. */
5184 ret = if_nametoindex(name);
5185 if (!ret) {
5186 DRV_LOG(WARNING,
5187 "VTEP %s failed to get index (%d)", name, errno);
5188 rte_flow_error_set
5189 (error, -errno,
5190 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5191 "netlink: failed to retrieve VTEP ifindex");
5192 goto error;
5193 }
5194 vtep->ifindex = ret;
5195 memset(buf, 0, sizeof(buf));
5196 nlh = mnl_nlmsg_put_header(buf);
5197 nlh->nlmsg_type = RTM_NEWLINK;
5198 nlh->nlmsg_flags = NLM_F_REQUEST;
5199 ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
5200 ifm->ifi_family = AF_UNSPEC;
5201 ifm->ifi_type = 0;
5202 ifm->ifi_index = vtep->ifindex;
5203 ifm->ifi_flags = IFF_UP;
5204 ifm->ifi_change = IFF_UP;
5205 ret = flow_tcf_nl_ack(tcf, nlh, NULL, NULL);
5206 if (ret) {
5207 rte_flow_error_set(error, -errno,
5208 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5209 "netlink: failed to set VTEP link up");
5210 DRV_LOG(WARNING, "netlink: VTEP %s set link up failure (%d)",
5211 name, rte_errno);
5212 goto clean;
5213 }
5214 ret = mlx5_flow_tcf_init(tcf, vtep->ifindex, error);
5215 if (ret) {
5216 DRV_LOG(WARNING, "VTEP %s init failure (%d)", name, rte_errno);
5217 goto clean;
5218 }
5219 DRV_LOG(INFO, "VTEP create (%d, %d)", vtep->port, vtep->ifindex);
5220 vtep->refcnt = 1;
5221 return vtep;
5222 clean:
5223 flow_tcf_vtep_delete(tcf, vtep);
5224 return NULL;
5225 error:
5226 rte_free(vtep);
5227 return NULL;
5228 }
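
/*
 * Hedged analogy (editor's note, assumptions marked): the RTM_NEWLINK
 * request built above is roughly what
 *
 *	ip link add <MLX5_VXLAN_DEVICE_PFX><port> type vxlan external \
 *		nolearning dstport <port>
 *	ip link set <MLX5_VXLAN_DEVICE_PFX><port> up
 *
 * would ask for ("external" assumes IFLA_VXLAN_COLLECT_METADATA support);
 * the driver then resolves the ifindex and initializes the tc context for
 * the new device via mlx5_flow_tcf_init().
 */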
5229
5230 /**
5231 * Acquire target interface index for VXLAN tunneling decapsulation.
5232 * In order to share the UDP port with the other interfaces, the
5233 * VXLAN device is created as not attached to any interface (if created).
5234 *
5235 * @param[in] tcf
5236 * Context object initialized by mlx5_flow_tcf_context_create().
5237 * @param[in] dev_flow
5238 * Flow tcf object with tunnel structure pointer set.
5239 * @param[out] error
5240 * Perform verbose error reporting if not NULL.
5241 * @return
5242 * Interface descriptor pointer on success,
5243 * NULL otherwise and rte_errno is set.
5244 */
5245 static struct tcf_vtep*
5246 flow_tcf_decap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5247 struct mlx5_flow *dev_flow,
5248 struct rte_flow_error *error)
5249 {
5250 struct tcf_vtep *vtep;
5251 uint16_t port = dev_flow->tcf.vxlan_decap->udp_port;
5252
5253 LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5254 if (vtep->port == port)
5255 break;
5256 }
5257 if (vtep) {
5258 /* Device exists, just increment the reference counter. */
5259 vtep->refcnt++;
5260 assert(vtep->ifindex);
5261 return vtep;
5262 }
5263 /* No decapsulation device exists, try to create the new one. */
5264 vtep = flow_tcf_vtep_create(tcf, port, error);
5265 if (vtep)
5266 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5267 return vtep;
5268 }
5269
5270 /**
5271 * Acquire target interface index for VXLAN tunneling encapsulation.
5272 *
5273 * @param[in] tcf
5274 * Context object initialized by mlx5_flow_tcf_context_create().
5275 * @param[in] ifouter
5276 * Network interface index to attach VXLAN encap device to.
5277 * @param[in] dev_flow
5278 * Flow tcf object with tunnel structure pointer set.
5279 * @param[out] error
5280 * Perform verbose error reporting if not NULL.
5281 * @return
5282 * Interface descriptor pointer on success,
5283 * NULL otherwise and rte_errno is set.
5284 */
5285 static struct tcf_vtep*
5286 flow_tcf_encap_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5287 unsigned int ifouter,
5288 struct mlx5_flow *dev_flow,
5289 struct rte_flow_error *error)
5290 {
5291 static uint16_t port;
5292 struct tcf_vtep *vtep;
5293 struct tcf_irule *iface;
5294 int ret;
5295
5296 assert(ifouter);
5297 /* Look whether the VTEP for specified port is created. */
5298 port = rte_be_to_cpu_16(dev_flow->tcf.vxlan_encap->udp.dst);
5299 LIST_FOREACH(vtep, &vtep_list_vxlan, next) {
5300 if (vtep->port == port)
5301 break;
5302 }
5303 if (vtep) {
5304 /* VTEP already exists, just increment the reference. */
5305 vtep->refcnt++;
5306 } else {
5307 /* Not found, we should create the new VTEP. */
5308 vtep = flow_tcf_vtep_create(tcf, port, error);
5309 if (!vtep)
5310 return NULL;
5311 LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
5312 }
5313 assert(vtep->ifindex);
5314 iface = flow_tcf_encap_irule_acquire(tcf, ifouter, error);
5315 if (!iface) {
5316 if (--vtep->refcnt == 0)
5317 flow_tcf_vtep_delete(tcf, vtep);
5318 return NULL;
5319 }
5320 dev_flow->tcf.vxlan_encap->iface = iface;
5321 /* Create local ipaddr with peer to specify the outer IPs. */
5322 ret = flow_tcf_encap_local(tcf, iface, dev_flow, true, error);
5323 if (!ret) {
5324 /* Create neigh rule to specify outer destination MAC. */
5325 ret = flow_tcf_encap_neigh(tcf, iface, dev_flow, true, error);
5326 if (ret)
5327 flow_tcf_encap_local(tcf, iface,
5328 dev_flow, false, error);
5329 }
5330 if (ret) {
5331 dev_flow->tcf.vxlan_encap->iface = NULL;
5332 flow_tcf_encap_irule_release(iface);
5333 if (--vtep->refcnt == 0)
5334 flow_tcf_vtep_delete(tcf, vtep);
5335 return NULL;
5336 }
5337 return vtep;
5338 }
5339
5340 /**
5341 * Acquires target interface index for tunneling of any type.
5342 * Creates the new VTEP if needed.
5343 *
5344 * @param[in] tcf
5345 * Context object initialized by mlx5_flow_tcf_context_create().
5346 * @param[in] ifouter
5347 * Network interface index to create VXLAN encap rules on.
5348 * @param[in] dev_flow
5349 * Flow tcf object with tunnel structure pointer set.
5350 * @param[out] error
5351 * Perform verbose error reporting if not NULL.
5352 * @return
5353 * Interface descriptor pointer on success,
5354 * NULL otherwise and rte_errno is set.
5355 */
5356 static struct tcf_vtep*
5357 flow_tcf_vtep_acquire(struct mlx5_flow_tcf_context *tcf,
5358 unsigned int ifouter,
5359 struct mlx5_flow *dev_flow,
5360 struct rte_flow_error *error)
5361 {
5362 struct tcf_vtep *vtep = NULL;
5363
5364 assert(dev_flow->tcf.tunnel);
5365 pthread_mutex_lock(&vtep_list_mutex);
5366 switch (dev_flow->tcf.tunnel->type) {
5367 case FLOW_TCF_TUNACT_VXLAN_ENCAP:
5368 vtep = flow_tcf_encap_vtep_acquire(tcf, ifouter,
5369 dev_flow, error);
5370 break;
5371 case FLOW_TCF_TUNACT_VXLAN_DECAP:
5372 vtep = flow_tcf_decap_vtep_acquire(tcf, dev_flow, error);
5373 break;
5374 default:
5375 rte_flow_error_set(error, ENOTSUP,
5376 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5377 "unsupported tunnel type");
5378 break;
5379 }
5380 pthread_mutex_unlock(&vtep_list_mutex);
5381 return vtep;
5382 }
5383
5384 /**
5385 * Release tunneling interface by ifindex. Decrements reference
5386 * counter and actually removes the device if counter is zero.
5387 *
5388 * @param[in] tcf
5389 * Context object initialized by mlx5_flow_tcf_context_create().
5390 * @param[in] vtep
5391 * VTEP device descriptor structure.
5392 * @param[in] dev_flow
5393 * Flow tcf object with tunnel structure pointer set.
5394 */
5395 static void
5396 flow_tcf_vtep_release(struct mlx5_flow_tcf_context *tcf,
5397 struct tcf_vtep *vtep,
5398 struct mlx5_flow *dev_flow)
5399 {
5400 assert(dev_flow->tcf.tunnel);
5401 pthread_mutex_lock(&vtep_list_mutex);
5402 switch (dev_flow->tcf.tunnel->type) {
5403 case FLOW_TCF_TUNACT_VXLAN_DECAP:
5404 break;
5405 case FLOW_TCF_TUNACT_VXLAN_ENCAP: {
5406 struct tcf_irule *iface;
5407
5408 /* Remove the encap ancillary rules first. */
5409 iface = dev_flow->tcf.vxlan_encap->iface;
5410 assert(iface);
5411 flow_tcf_encap_neigh(tcf, iface, dev_flow, false, NULL);
5412 flow_tcf_encap_local(tcf, iface, dev_flow, false, NULL);
5413 flow_tcf_encap_irule_release(iface);
5414 dev_flow->tcf.vxlan_encap->iface = NULL;
5415 break;
5416 }
5417 default:
5418 assert(false);
5419 DRV_LOG(WARNING, "Unsupported tunnel type");
5420 break;
5421 }
5422 assert(vtep->refcnt);
5423 if (--vtep->refcnt == 0) {
5424 LIST_REMOVE(vtep, next);
5425 flow_tcf_vtep_delete(tcf, vtep);
5426 }
5427 pthread_mutex_unlock(&vtep_list_mutex);
5428 }
5429
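/*
 * Illustrative sketch (not part of the driver): the VTEP management above
 * follows a "lookup or create, then reference count" idiom protected by
 * vtep_list_mutex. A device flow that needs a tunnel device acquires the
 * VTEP while the rule is applied and releases it when the rule is removed.
 * The function names below are the ones defined in this file; the calling
 * code itself is hypothetical and only shows the expected pairing.
 */
#if 0 /* example only, never compiled */
	vtep = flow_tcf_vtep_acquire(ctx, ifouter, dev_flow, error);
	if (!vtep)
		return -rte_errno; /* lookup and creation both failed */
	/* ... the rule is applied using vtep->ifindex ... */
	flow_tcf_vtep_release(ctx, vtep, dev_flow); /* deletes VTEP at refcnt 0 */
#endif
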
5430 struct tcf_nlcb_query {
5431 uint32_t handle;
5432 uint32_t tc_flags;
5433 uint32_t flags_valid:1;
5434 };
5435
5436 /**
5437  * Collect queried rule attributes. This is a callback routine called by
5438  * libmnl mnl_cb_run() in a loop for every message in the received packet.
5439 * Current implementation collects the flower flags only.
5440 *
5441 * @param[in] nlh
5442 * Pointer to reply header.
5443 * @param[in, out] arg
5444 * Context pointer for this callback.
5445 *
5446 * @return
5447 * A positive, nonzero value on success (required by libmnl
5448  * to continue message processing).
5449 */
5450 static int
5451 flow_tcf_collect_query_cb(const struct nlmsghdr *nlh, void *arg)
5452 {
5453 struct tcf_nlcb_query *query = arg;
5454 struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
5455 struct nlattr *na, *na_opt;
5456 bool flower = false;
5457
5458 if (nlh->nlmsg_type != RTM_NEWTFILTER ||
5459 tcm->tcm_handle != query->handle)
5460 return 1;
5461 mnl_attr_for_each(na, nlh, sizeof(*tcm)) {
5462 switch (mnl_attr_get_type(na)) {
5463 case TCA_KIND:
5464 if (strcmp(mnl_attr_get_payload(na), "flower")) {
5465 /* Not flower filter, drop entire message. */
5466 return 1;
5467 }
5468 flower = true;
5469 break;
5470 case TCA_OPTIONS:
5471 if (!flower) {
5472 /* Not flower options, drop entire message. */
5473 return 1;
5474 }
5475 /* Check nested flower options. */
5476 mnl_attr_for_each_nested(na_opt, na) {
5477 switch (mnl_attr_get_type(na_opt)) {
5478 case TCA_FLOWER_FLAGS:
5479 query->flags_valid = 1;
5480 query->tc_flags =
5481 mnl_attr_get_u32(na_opt);
5482 break;
5483 }
5484 }
5485 break;
5486 }
5487 }
5488 return 1;
5489 }
5490
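/*
 * Illustrative sketch (not part of the driver): a callback such as
 * flow_tcf_collect_query_cb() is driven by libmnl's mnl_cb_run() for every
 * message received in reply to a request. The snippet below is a minimal,
 * hypothetical receive loop; "nl" (a bound NETLINK_ROUTE socket), "buf",
 * "buf_size", "seq", "nlh" and "query" are assumed to exist in the caller.
 * flow_tcf_nl_ack() wraps a broadly similar send/receive sequence.
 */
#if 0 /* example only, never compiled */
	if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) == -1)
		return -errno;
	while ((ret = mnl_socket_recvfrom(nl, buf, buf_size)) > 0) {
		ret = mnl_cb_run(buf, ret, seq, mnl_socket_get_portid(nl),
				 flow_tcf_collect_query_cb, &query);
		if (ret <= MNL_CB_STOP)
			break;
	}
#endif
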
5491 /**
5492 * Query a TC flower rule flags via netlink.
5493 *
5494 * @param[in] tcf
5495 * Context object initialized by mlx5_flow_tcf_context_create().
5496 * @param[in] dev_flow
5497 * Pointer to the flow.
5498 * @param[out] pflags
5499  * Pointer to the data retrieved by the query.
5500 *
5501 * @return
5502 * 0 on success, a negative errno value otherwise.
5503 */
5504 static int
5505 flow_tcf_query_flags(struct mlx5_flow_tcf_context *tcf,
5506 struct mlx5_flow *dev_flow,
5507 uint32_t *pflags)
5508 {
5509 struct nlmsghdr *nlh;
5510 struct tcmsg *tcm;
5511 struct tcf_nlcb_query query = {
5512 .handle = dev_flow->tcf.tcm->tcm_handle,
5513 };
5514
5515 nlh = mnl_nlmsg_put_header(tcf->buf);
5516 nlh->nlmsg_type = RTM_GETTFILTER;
5517 nlh->nlmsg_flags = NLM_F_REQUEST;
5518 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
5519 memcpy(tcm, dev_flow->tcf.tcm, sizeof(*tcm));
5520 /*
5521 * Ignore Netlink error for filter query operations.
5522 	 * The reply length is sent by the kernel as errno.
5523 * Just check we got the flags option.
5524 */
5525 flow_tcf_nl_ack(tcf, nlh, flow_tcf_collect_query_cb, &query);
5526 if (!query.flags_valid) {
5527 *pflags = 0;
5528 return -ENOENT;
5529 }
5530 *pflags = query.tc_flags;
5531 return 0;
5532 }
5533
5534 /**
5535 * Query and check the in_hw set for specified rule.
5536 *
5537 * @param[in] tcf
5538 * Context object initialized by mlx5_flow_tcf_context_create().
5539 * @param[in] dev_flow
5540 * Pointer to the flow to check.
5541 *
5542 * @return
5543 * 0 on success, a negative errno value otherwise.
5544 */
5545 static int
5546 flow_tcf_check_inhw(struct mlx5_flow_tcf_context *tcf,
5547 struct mlx5_flow *dev_flow)
5548 {
5549 uint32_t flags;
5550 int ret;
5551
5552 ret = flow_tcf_query_flags(tcf, dev_flow, &flags);
5553 if (ret)
5554 return ret;
5555 return (flags & TCA_CLS_FLAGS_IN_HW) ? 0 : -ENOENT;
5556 }
5557
5558 /**
5559 * Remove flow from E-Switch by sending Netlink message.
5560 *
5561 * @param[in] dev
5562 * Pointer to Ethernet device.
5563 * @param[in, out] flow
5564 * Pointer to the sub flow.
5565 */
5566 static void
5567 flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
5568 {
5569 struct mlx5_priv *priv = dev->data->dev_private;
5570 struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5571 struct mlx5_flow *dev_flow;
5572 struct nlmsghdr *nlh;
5573 struct tcmsg *tcm;
5574
5575 if (!flow)
5576 return;
5577 dev_flow = LIST_FIRST(&flow->dev_flows);
5578 if (!dev_flow)
5579 return;
5580 /* E-Switch flow can't be expanded. */
5581 assert(!LIST_NEXT(dev_flow, next));
5582 if (dev_flow->tcf.applied) {
5583 nlh = dev_flow->tcf.nlh;
5584 nlh->nlmsg_type = RTM_DELTFILTER;
5585 nlh->nlmsg_flags = NLM_F_REQUEST;
5586 flow_tcf_nl_ack(ctx, nlh, NULL, NULL);
5587 if (dev_flow->tcf.tunnel) {
5588 assert(dev_flow->tcf.tunnel->vtep);
5589 flow_tcf_vtep_release(ctx,
5590 dev_flow->tcf.tunnel->vtep,
5591 dev_flow);
5592 dev_flow->tcf.tunnel->vtep = NULL;
5593 }
5594 /* Cleanup the rule handle value. */
5595 tcm = mnl_nlmsg_get_payload(nlh);
5596 tcm->tcm_handle = 0;
5597 dev_flow->tcf.applied = 0;
5598 }
5599 }
5600
5601 /**
5602  * Fetch the applied rule handle. This is a callback routine called by
5603  * libmnl mnl_cb_run() in a loop for every message in the received packet.
5604  * When the NLM_F_ECHO flag is specified, the kernel sends the created
5605  * rule descriptor back to the application and we can retrieve the
5606  * actual rule handle from the updated descriptor.
5607 *
5608 * @param[in] nlh
5609 * Pointer to reply header.
5610 * @param[in, out] arg
5611 * Context pointer for this callback.
5612 *
5613 * @return
5614 * A positive, nonzero value on success (required by libmnl
5615  * to continue message processing).
5616 */
5617 static int
5618 flow_tcf_collect_apply_cb(const struct nlmsghdr *nlh, void *arg)
5619 {
5620 struct nlmsghdr *nlhrq = arg;
5621 struct tcmsg *tcmrq = mnl_nlmsg_get_payload(nlhrq);
5622 struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
5623 struct nlattr *na;
5624
5625 if (nlh->nlmsg_type != RTM_NEWTFILTER ||
5626 nlh->nlmsg_seq != nlhrq->nlmsg_seq)
5627 return 1;
5628 mnl_attr_for_each(na, nlh, sizeof(*tcm)) {
5629 switch (mnl_attr_get_type(na)) {
5630 case TCA_KIND:
5631 if (strcmp(mnl_attr_get_payload(na), "flower")) {
5632 /* Not flower filter, drop entire message. */
5633 return 1;
5634 }
5635 tcmrq->tcm_handle = tcm->tcm_handle;
5636 return 1;
5637 }
5638 }
5639 return 1;
5640 }
5641 /**
5642 * Apply flow to E-Switch by sending Netlink message.
5643 *
5644 * @param[in] dev
5645 * Pointer to Ethernet device.
5646 * @param[in, out] flow
5647 * Pointer to the sub flow.
5648 * @param[out] error
5649 * Pointer to the error structure.
5650 *
5651 * @return
5652 * 0 on success, a negative errno value otherwise and rte_errno is set.
5653 */
5654 static int
5655 flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
5656 struct rte_flow_error *error)
5657 {
5658 struct mlx5_priv *priv = dev->data->dev_private;
5659 struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
5660 struct mlx5_flow *dev_flow;
5661 struct nlmsghdr *nlh;
5662 struct tcmsg *tcm;
5663 uint64_t start = 0;
5664 uint64_t twait = 0;
5665 int ret;
5666
5667 dev_flow = LIST_FIRST(&flow->dev_flows);
5668 /* E-Switch flow can't be expanded. */
5669 assert(!LIST_NEXT(dev_flow, next));
5670 if (dev_flow->tcf.applied)
5671 return 0;
5672 nlh = dev_flow->tcf.nlh;
5673 nlh->nlmsg_type = RTM_NEWTFILTER;
5674 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
5675 NLM_F_EXCL | NLM_F_ECHO;
5676 tcm = mnl_nlmsg_get_payload(nlh);
5677 /* Allow kernel to assign handle on its own. */
5678 tcm->tcm_handle = 0;
5679 if (dev_flow->tcf.tunnel) {
5680 /*
5681 * Replace the interface index, target for
5682 * encapsulation, source for decapsulation.
5683 */
5684 assert(!dev_flow->tcf.tunnel->vtep);
5685 assert(dev_flow->tcf.tunnel->ifindex_ptr);
5686 /* Acquire actual VTEP device when rule is being applied. */
5687 dev_flow->tcf.tunnel->vtep =
5688 flow_tcf_vtep_acquire(ctx,
5689 dev_flow->tcf.tunnel->ifindex_org,
5690 dev_flow, error);
5691 if (!dev_flow->tcf.tunnel->vtep)
5692 return -rte_errno;
5693 DRV_LOG(INFO, "Replace ifindex: %d->%d",
5694 dev_flow->tcf.tunnel->vtep->ifindex,
5695 dev_flow->tcf.tunnel->ifindex_org);
5696 *dev_flow->tcf.tunnel->ifindex_ptr =
5697 dev_flow->tcf.tunnel->vtep->ifindex;
5698 if (dev_flow->tcf.tunnel->vtep->waitreg) {
5699 /* Clear wait flag for VXLAN port registration. */
5700 dev_flow->tcf.tunnel->vtep->waitreg = 0;
5701 twait = rte_get_timer_hz();
5702 assert(twait > MS_PER_S);
5703 twait = twait * MLX5_VXLAN_WAIT_PORT_REG_MS;
5704 twait = twait / MS_PER_S;
5705 start = rte_get_timer_cycles();
5706 }
5707 }
5708 /*
5709 	 * The kernel creates the VXLAN devices and registers the UDP ports
5710 	 * for hardware offload within the NIC kernel drivers. The
5711 	 * registration is performed in the context of a kernel worker
5712 	 * thread, so a race condition is possible: the VXLAN device is
5713 	 * created and success is returned to the calling application while
5714 	 * the UDP port registration is still in progress. The next applied
5715 	 * rule may then be rejected by the driver with an ENOTSUP code. We
5716 	 * wait a bit to allow the registration process to complete. The
5717 	 * waiting is performed only once, right after the device has been
5718 	 * created.
5719 */
5720 do {
5721 struct timespec onems;
5722
5723 ret = flow_tcf_nl_ack(ctx, nlh,
5724 flow_tcf_collect_apply_cb, nlh);
5725 if (!ret || ret != -ENOTSUP || !twait)
5726 break;
5727 /* Wait one millisecond and try again till timeout. */
5728 onems.tv_sec = 0;
5729 onems.tv_nsec = NS_PER_S / MS_PER_S;
5730 nanosleep(&onems, 0);
5731 if ((rte_get_timer_cycles() - start) > twait) {
5732 /* Timeout elapsed, try once more and exit. */
5733 twait = 0;
5734 }
5735 } while (true);
5736 if (!ret) {
5737 if (!tcm->tcm_handle) {
5738 flow_tcf_remove(dev, flow);
5739 return rte_flow_error_set
5740 (error, ENOENT,
5741 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5742 "netlink: rule zero handle returned");
5743 }
5744 dev_flow->tcf.applied = 1;
5745 if (*dev_flow->tcf.ptc_flags & TCA_CLS_FLAGS_SKIP_SW)
5746 return 0;
5747 /*
5748 * Rule was applied without skip_sw flag set.
5749 		 * We should check whether the rule was actually
5750 * accepted by hardware (have look at in_hw flag).
5751 */
5752 if (flow_tcf_check_inhw(ctx, dev_flow)) {
5753 flow_tcf_remove(dev, flow);
5754 return rte_flow_error_set
5755 (error, ENOENT,
5756 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5757 "netlink: rule has no in_hw flag set");
5758 }
5759 return 0;
5760 }
5761 if (dev_flow->tcf.tunnel) {
5762 /* Rollback the VTEP configuration if rule apply failed. */
5763 assert(dev_flow->tcf.tunnel->vtep);
5764 flow_tcf_vtep_release(ctx, dev_flow->tcf.tunnel->vtep,
5765 dev_flow);
5766 dev_flow->tcf.tunnel->vtep = NULL;
5767 }
5768 return rte_flow_error_set(error, rte_errno,
5769 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
5770 "netlink: failed to create TC flow rule");
5771 }
5772
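/*
 * Illustrative sketch (not part of the driver): the apply routine above
 * converts a millisecond budget into TSC cycles and retries once per
 * millisecond until the budget is exhausted. For example, assuming
 * rte_get_timer_hz() returns 2,500,000,000 (a 2.5 GHz timer) and a 10 ms
 * budget, twait amounts to 25,000,000 cycles. The generic shape of the
 * retry loop, with hypothetical names ("wait_ms", "try_operation"), is:
 */
#if 0 /* example only, never compiled */
	uint64_t budget = rte_get_timer_hz() * wait_ms / MS_PER_S;
	uint64_t begin = rte_get_timer_cycles();
	struct timespec onems = { 0, NS_PER_S / MS_PER_S };
	int ret;

	do {
		ret = try_operation();
		if (ret != -ENOTSUP)
			break; /* success or a non-retryable failure */
		nanosleep(&onems, NULL); /* back off for one millisecond */
	} while (rte_get_timer_cycles() - begin < budget);
#endif
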
5773 /**
5774 * Remove flow from E-Switch and release resources of the device flow.
5775 *
5776 * @param[in] dev
5777 * Pointer to Ethernet device.
5778 * @param[in, out] flow
5779 * Pointer to the sub flow.
5780 */
5781 static void
5782 flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
5783 {
5784 struct mlx5_flow *dev_flow;
5785
5786 if (!flow)
5787 return;
5788 flow_tcf_remove(dev, flow);
5789 if (flow->counter) {
5790 if (--flow->counter->ref_cnt == 0) {
5791 rte_free(flow->counter);
5792 flow->counter = NULL;
5793 }
5794 }
5795 dev_flow = LIST_FIRST(&flow->dev_flows);
5796 if (!dev_flow)
5797 return;
5798 /* E-Switch flow can't be expanded. */
5799 assert(!LIST_NEXT(dev_flow, next));
5800 LIST_REMOVE(dev_flow, next);
5801 rte_free(dev_flow);
5802 }
5803
5804 /**
5805 * Helper routine for figuring the space size required for a parse buffer.
5806 *
5807 * @param array
5808 * array of values to use.
5809 * @param idx
5810 * Current location in array.
5811 * @param value
5812 * Value to compare with.
5813 *
5814 * @return
5815 * The maximum between the given value and the array value on index.
5816 */
5817 static uint16_t
5818 flow_tcf_arr_val_max(uint16_t array[], int idx, uint16_t value)
5819 {
5820 	return idx < 0 ? value : RTE_MAX(array[idx], value);
5821 }
5822
5823 /**
5824 * Parse rtnetlink message attributes filling the attribute table with the info
5825 * retrieved.
5826 *
5827 * @param tb
5828 * Attribute table to be filled.
5829 * @param[out] max
5830  *   Maximum entry in the attribute table.
5831  * @param rta
5832 * The attributes section in the message to be parsed.
5833 * @param len
5834 * The length of the attributes section in the message.
5835 */
5836 static void
5837 flow_tcf_nl_parse_rtattr(struct rtattr *tb[], int max,
5838 struct rtattr *rta, int len)
5839 {
5840 unsigned short type;
5841 memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
5842 while (RTA_OK(rta, len)) {
5843 type = rta->rta_type;
5844 if (type <= max && !tb[type])
5845 tb[type] = rta;
5846 rta = RTA_NEXT(rta, len);
5847 }
5848 }
5849
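/*
 * Illustrative sketch (not part of the driver): once the table has been
 * filled by flow_tcf_nl_parse_rtattr(), attributes are looked up by type
 * and read through the standard RTA_DATA()/RTA_PAYLOAD() macros. The
 * caller below is hypothetical; "nlh" is a filter reply and "t" points to
 * its struct tcmsg payload, as in flow_tcf_nl_filter_parse_and_get().
 */
#if 0 /* example only, never compiled */
	struct rtattr *tb[TCA_MAX + 1];

	flow_tcf_nl_parse_rtattr(tb, TCA_MAX, TCA_RTA(t),
				 nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t)));
	if (tb[TCA_KIND])
		DRV_LOG(DEBUG, "filter kind: %s",
			(const char *)RTA_DATA(tb[TCA_KIND]));
#endif
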
5850 /**
5851 * Extract flow counters from flower action.
5852 *
5853 * @param rta
5854 * flower action stats properties in the Netlink message received.
5855 * @param rta_type
5856 * The backward sequence of rta_types, as written in the attribute table,
5857  *   that we need to traverse in order to reach the requested object.
5858 * @param idx
5859 * Current location in rta_type table.
5860 * @param[out] data
5861 * data holding the count statistics of the rte_flow retrieved from
5862 * the message.
5863 *
5864 * @return
5865 * 0 if data was found and retrieved, -1 otherwise.
5866 */
5867 static int
5868 flow_tcf_nl_action_stats_parse_and_get(struct rtattr *rta,
5869 uint16_t rta_type[], int idx,
5870 struct gnet_stats_basic *data)
5871 {
5872 int tca_stats_max = flow_tcf_arr_val_max(rta_type, idx,
5873 TCA_STATS_BASIC);
5874 struct rtattr *tbs[tca_stats_max + 1];
5875
5876 if (rta == NULL || idx < 0)
5877 return -1;
5878 flow_tcf_nl_parse_rtattr(tbs, tca_stats_max,
5879 RTA_DATA(rta), RTA_PAYLOAD(rta));
5880 switch (rta_type[idx]) {
5881 case TCA_STATS_BASIC:
5882 if (tbs[TCA_STATS_BASIC]) {
5883 memcpy(data, RTA_DATA(tbs[TCA_STATS_BASIC]),
5884 RTE_MIN(RTA_PAYLOAD(tbs[TCA_STATS_BASIC]),
5885 sizeof(*data)));
5886 return 0;
5887 }
5888 break;
5889 default:
5890 break;
5891 }
5892 return -1;
5893 }
5894
5895 /**
5896 * Parse flower single action retrieving the requested action attribute,
5897 * if found.
5898 *
5899 * @param arg
5900 * flower action properties in the Netlink message received.
5901 * @param rta_type
5902 * The backward sequence of rta_types, as written in the attribute table,
5903  *   that we need to traverse in order to reach the requested object.
5904 * @param idx
5905 * Current location in rta_type table.
5906 * @param[out] data
5907 * Count statistics retrieved from the message query.
5908 *
5909 * @return
5910 * 0 if data was found and retrieved, -1 otherwise.
5911 */
5912 static int
5913 flow_tcf_nl_parse_one_action_and_get(struct rtattr *arg,
5914 uint16_t rta_type[], int idx, void *data)
5915 {
5916 int tca_act_max = flow_tcf_arr_val_max(rta_type, idx, TCA_ACT_STATS);
5917 struct rtattr *tb[tca_act_max + 1];
5918
5919 if (arg == NULL || idx < 0)
5920 return -1;
5921 flow_tcf_nl_parse_rtattr(tb, tca_act_max,
5922 RTA_DATA(arg), RTA_PAYLOAD(arg));
5923 if (tb[TCA_ACT_KIND] == NULL)
5924 return -1;
5925 switch (rta_type[idx]) {
5926 case TCA_ACT_STATS:
5927 if (tb[TCA_ACT_STATS])
5928 return flow_tcf_nl_action_stats_parse_and_get
5929 (tb[TCA_ACT_STATS],
5930 rta_type, --idx,
5931 (struct gnet_stats_basic *)data);
5932 break;
5933 default:
5934 break;
5935 }
5936 return -1;
5937 }
5938
5939 /**
5940 * Parse flower action section in the message retrieving the requested
5941 * attribute from the first action that provides it.
5942 *
5943  * @param arg
5944 * flower section in the Netlink message received.
5945 * @param rta_type
5946 * The backward sequence of rta_types, as written in the attribute table,
5947  *   that we need to traverse in order to reach the requested object.
5948 * @param idx
5949 * Current location in rta_type table.
5950 * @param[out] data
5951 * data retrieved from the message query.
5952 *
5953 * @return
5954 * 0 if data was found and retrieved, -1 otherwise.
5955 */
5956 static int
5957 flow_tcf_nl_action_parse_and_get(struct rtattr *arg,
5958 uint16_t rta_type[], int idx, void *data)
5959 {
5960 struct rtattr *tb[TCA_ACT_MAX_PRIO + 1];
5961 int i;
5962
5963 if (arg == NULL || idx < 0)
5964 return -1;
5965 flow_tcf_nl_parse_rtattr(tb, TCA_ACT_MAX_PRIO,
5966 RTA_DATA(arg), RTA_PAYLOAD(arg));
5967 switch (rta_type[idx]) {
5968 /*
5969 * flow counters are stored in the actions defined by the flow
5970 * and not in the flow itself, therefore we need to traverse the
5971 * flower chain of actions in search for them.
5972 *
5973 * Note that the index is not decremented here.
5974 */
5975 case TCA_ACT_STATS:
5976 for (i = 0; i <= TCA_ACT_MAX_PRIO; i++) {
5977 if (tb[i] &&
5978 !flow_tcf_nl_parse_one_action_and_get(tb[i],
5979 rta_type,
5980 idx, data))
5981 return 0;
5982 }
5983 break;
5984 default:
5985 break;
5986 }
5987 return -1;
5988 }
5989
5990 /**
5991 * Parse flower classifier options in the message, retrieving the requested
5992 * attribute if found.
5993 *
5994 * @param opt
5995 * flower section in the Netlink message received.
5996 * @param rta_type
5997 * The backward sequence of rta_types, as written in the attribute table,
5998  *   that we need to traverse in order to reach the requested object.
5999 * @param idx
6000 * Current location in rta_type table.
6001 * @param[out] data
6002 * data retrieved from the message query.
6003 *
6004 * @return
6005 * 0 if data was found and retrieved, -1 otherwise.
6006 */
6007 static int
6008 flow_tcf_nl_opts_parse_and_get(struct rtattr *opt,
6009 uint16_t rta_type[], int idx, void *data)
6010 {
6011 int tca_flower_max = flow_tcf_arr_val_max(rta_type, idx,
6012 TCA_FLOWER_ACT);
6013 struct rtattr *tb[tca_flower_max + 1];
6014
6015 if (!opt || idx < 0)
6016 return -1;
6017 flow_tcf_nl_parse_rtattr(tb, tca_flower_max,
6018 RTA_DATA(opt), RTA_PAYLOAD(opt));
6019 switch (rta_type[idx]) {
6020 case TCA_FLOWER_ACT:
6021 if (tb[TCA_FLOWER_ACT])
6022 return flow_tcf_nl_action_parse_and_get
6023 (tb[TCA_FLOWER_ACT],
6024 rta_type, --idx, data);
6025 break;
6026 default:
6027 break;
6028 }
6029 return -1;
6030 }
6031
6032 /**
6033 * Parse Netlink reply on filter query, retrieving the flow counters.
6034 *
6035  * @param cnlh
6036 * Message received from Netlink.
6037 * @param rta_type
6038 * The backward sequence of rta_types, as written in the attribute table,
6039  *   that we need to traverse in order to reach the requested object.
6040 * @param idx
6041 * Current location in rta_type table.
6042 * @param[out] data
6043 * data retrieved from the message query.
6044 *
6045 * @return
6046 * 0 if data was found and retrieved, -1 otherwise.
6047 */
6048 static int
6049 flow_tcf_nl_filter_parse_and_get(struct nlmsghdr *cnlh,
6050 uint16_t rta_type[], int idx, void *data)
6051 {
6052 struct nlmsghdr *nlh = cnlh;
6053 struct tcmsg *t = NLMSG_DATA(nlh);
6054 int len = nlh->nlmsg_len;
6055 int tca_max = flow_tcf_arr_val_max(rta_type, idx, TCA_OPTIONS);
6056 struct rtattr *tb[tca_max + 1];
6057
6058 if (idx < 0)
6059 return -1;
6060 if (nlh->nlmsg_type != RTM_NEWTFILTER &&
6061 nlh->nlmsg_type != RTM_GETTFILTER &&
6062 nlh->nlmsg_type != RTM_DELTFILTER)
6063 return -1;
6064 len -= NLMSG_LENGTH(sizeof(*t));
6065 if (len < 0)
6066 return -1;
6067 flow_tcf_nl_parse_rtattr(tb, tca_max, TCA_RTA(t), len);
6068 /* Not a TC flower flow - bail out */
6069 if (!tb[TCA_KIND] ||
6070 strcmp(RTA_DATA(tb[TCA_KIND]), "flower"))
6071 return -1;
6072 switch (rta_type[idx]) {
6073 case TCA_OPTIONS:
6074 if (tb[TCA_OPTIONS])
6075 return flow_tcf_nl_opts_parse_and_get(tb[TCA_OPTIONS],
6076 rta_type,
6077 --idx, data);
6078 break;
6079 default:
6080 break;
6081 }
6082 return -1;
6083 }
6084
6085 /**
6086 * A callback to parse Netlink reply on TC flower query.
6087 *
6088 * @param nlh
6089 * Message received from Netlink.
6090 * @param[out] data
6091  *   Pointer to data area to be filled by the parsing routine,
6092 * assumed to be a pointer to struct flow_tcf_stats_basic.
6093 *
6094 * @return
6095 * MNL_CB_OK value.
6096 */
6097 static int
6098 flow_tcf_nl_message_get_stats_basic(const struct nlmsghdr *nlh, void *data)
6099 {
6100 /*
6101 * The backward sequence of rta_types to pass in order to get
6102 * to the counters.
6103 */
6104 uint16_t rta_type[] = { TCA_STATS_BASIC, TCA_ACT_STATS,
6105 TCA_FLOWER_ACT, TCA_OPTIONS };
6106 struct flow_tcf_stats_basic *sb_data = data;
6107 union {
6108 const struct nlmsghdr *c;
6109 struct nlmsghdr *nc;
6110 } tnlh = { .c = nlh };
6111
6112 if (!flow_tcf_nl_filter_parse_and_get(tnlh.nc, rta_type,
6113 RTE_DIM(rta_type) - 1,
6114 (void *)&sb_data->counters))
6115 sb_data->valid = true;
6116 return MNL_CB_OK;
6117 }
6118
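/*
 * Note on the rta_type chain above: the array is consumed backwards,
 * starting at idx == RTE_DIM(rta_type) - 1. Each parsing level strips one
 * layer of Netlink nesting and decrements idx:
 *   idx 3: TCA_OPTIONS     - flower options of the filter message,
 *   idx 2: TCA_FLOWER_ACT  - action list inside the flower options,
 *   idx 1: TCA_ACT_STATS   - statistics block of a single action,
 *   idx 0: TCA_STATS_BASIC - basic packet/byte counters.
 * The same machinery could retrieve other nested attributes by supplying
 * a different backward chain.
 */
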
6119 /**
6120 * Query a TC flower rule for its statistics via netlink.
6121 *
6122 * @param[in] dev
6123 * Pointer to Ethernet device.
6124 * @param[in] flow
6125 * Pointer to the sub flow.
6126 * @param[out] data
6127 * data retrieved by the query.
6128 * @param[out] error
6129 * Perform verbose error reporting if not NULL.
6130 *
6131 * @return
6132 * 0 on success, a negative errno value otherwise and rte_errno is set.
6133 */
6134 static int
6135 flow_tcf_query_count(struct rte_eth_dev *dev,
6136 struct rte_flow *flow,
6137 void *data,
6138 struct rte_flow_error *error)
6139 {
6140 struct flow_tcf_stats_basic sb_data;
6141 struct rte_flow_query_count *qc = data;
6142 struct mlx5_priv *priv = dev->data->dev_private;
6143 struct mlx5_flow_tcf_context *ctx = priv->tcf_context;
6144 struct mnl_socket *nl = ctx->nl;
6145 struct mlx5_flow *dev_flow;
6146 struct nlmsghdr *nlh;
6147 uint32_t seq = priv->tcf_context->seq++;
6148 ssize_t ret;
6149 assert(qc);
6150
6151 memset(&sb_data, 0, sizeof(sb_data));
6152 dev_flow = LIST_FIRST(&flow->dev_flows);
6153 /* E-Switch flow can't be expanded. */
6154 assert(!LIST_NEXT(dev_flow, next));
6155 if (!dev_flow->flow->counter)
6156 goto notsup_exit;
6157 nlh = dev_flow->tcf.nlh;
6158 nlh->nlmsg_type = RTM_GETTFILTER;
6159 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ECHO;
6160 nlh->nlmsg_seq = seq;
6161 if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) == -1)
6162 goto error_exit;
6163 do {
6164 ret = mnl_socket_recvfrom(nl, ctx->buf, ctx->buf_size);
6165 if (ret <= 0)
6166 break;
6167 ret = mnl_cb_run(ctx->buf, ret, seq,
6168 mnl_socket_get_portid(nl),
6169 flow_tcf_nl_message_get_stats_basic,
6170 (void *)&sb_data);
6171 } while (ret > 0);
6172 	/* Report counters only if the query reply contained them. */
6173 if (sb_data.valid) {
6174 /* Return the delta from last reset. */
6175 qc->hits_set = 1;
6176 qc->bytes_set = 1;
6177 qc->hits = sb_data.counters.packets - flow->counter->hits;
6178 qc->bytes = sb_data.counters.bytes - flow->counter->bytes;
6179 if (qc->reset) {
6180 flow->counter->hits = sb_data.counters.packets;
6181 flow->counter->bytes = sb_data.counters.bytes;
6182 }
6183 return 0;
6184 }
6185 return rte_flow_error_set(error, EINVAL,
6186 RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
6187 NULL,
6188 "flow does not have counter");
6189 error_exit:
6190 return rte_flow_error_set
6191 (error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
6192 NULL, "netlink: failed to read flow rule counters");
6193 notsup_exit:
6194 return rte_flow_error_set
6195 (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
6196 NULL, "counters are not available.");
6197 }
6198
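/*
 * Illustrative sketch (not part of the driver): from the application side
 * this path is reached through rte_flow_query() with a COUNT action. The
 * caller below is hypothetical; "port_id" and "flow" are assumed to refer
 * to an existing port and flow rule created with a COUNT action.
 */
#if 0 /* example only, never compiled */
	struct rte_flow_query_count count = { .reset = 0 };
	const struct rte_flow_action action[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_COUNT },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};
	struct rte_flow_error err;

	if (!rte_flow_query(port_id, flow, action, &count, &err) &&
	    count.hits_set && count.bytes_set)
		printf("hits: %llu bytes: %llu\n",
		       (unsigned long long)count.hits,
		       (unsigned long long)count.bytes);
#endif
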
6199 /**
6200 * Query a flow.
6201 *
6202 * @see rte_flow_query()
6203 * @see rte_flow_ops
6204 */
6205 static int
6206 flow_tcf_query(struct rte_eth_dev *dev,
6207 struct rte_flow *flow,
6208 const struct rte_flow_action *actions,
6209 void *data,
6210 struct rte_flow_error *error)
6211 {
6212 int ret = -EINVAL;
6213
6214 for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
6215 switch (actions->type) {
6216 case RTE_FLOW_ACTION_TYPE_VOID:
6217 break;
6218 case RTE_FLOW_ACTION_TYPE_COUNT:
6219 ret = flow_tcf_query_count(dev, flow, data, error);
6220 break;
6221 default:
6222 return rte_flow_error_set(error, ENOTSUP,
6223 RTE_FLOW_ERROR_TYPE_ACTION,
6224 actions,
6225 "action not supported");
6226 }
6227 }
6228 return ret;
6229 }
6230
6231 const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops = {
6232 .validate = flow_tcf_validate,
6233 .prepare = flow_tcf_prepare,
6234 .translate = flow_tcf_translate,
6235 .apply = flow_tcf_apply,
6236 .remove = flow_tcf_remove,
6237 .destroy = flow_tcf_destroy,
6238 .query = flow_tcf_query,
6239 };
6240
6241 /**
6242 * Create and configure a libmnl socket for Netlink flow rules.
6243 *
6244 * @return
6245 * A valid libmnl socket object pointer on success, NULL otherwise and
6246 * rte_errno is set.
6247 */
6248 static struct mnl_socket *
6249 flow_tcf_mnl_socket_create(void)
6250 {
6251 struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
6252
6253 if (nl) {
6254 mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
6255 sizeof(int));
6256 if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
6257 return nl;
6258 }
6259 rte_errno = errno;
6260 if (nl)
6261 mnl_socket_close(nl);
6262 return NULL;
6263 }
6264
6265 /**
6266 * Destroy a libmnl socket.
6267 *
6268 * @param nl
6269 * Libmnl socket of the @p NETLINK_ROUTE kind.
6270 */
6271 static void
6272 flow_tcf_mnl_socket_destroy(struct mnl_socket *nl)
6273 {
6274 if (nl)
6275 mnl_socket_close(nl);
6276 }
6277
6278 /**
6279 * Initialize ingress qdisc of a given network interface.
6280 *
6281 * @param ctx
6282 * Pointer to tc-flower context to use.
6283 * @param ifindex
6284 * Index of network interface to initialize.
6285 * @param[out] error
6286 * Perform verbose error reporting if not NULL.
6287 *
6288 * @return
6289 * 0 on success, a negative errno value otherwise and rte_errno is set.
6290 */
6291 int
6292 mlx5_flow_tcf_init(struct mlx5_flow_tcf_context *ctx,
6293 unsigned int ifindex, struct rte_flow_error *error)
6294 {
6295 struct nlmsghdr *nlh;
6296 struct tcmsg *tcm;
6297 alignas(struct nlmsghdr)
6298 uint8_t buf[mnl_nlmsg_size(sizeof(*tcm)) +
6299 SZ_NLATTR_STRZ_OF("ingress") +
6300 MNL_BUF_EXTRA_SPACE];
6301
6302 /* Destroy existing ingress qdisc and everything attached to it. */
6303 nlh = mnl_nlmsg_put_header(buf);
6304 nlh->nlmsg_type = RTM_DELQDISC;
6305 nlh->nlmsg_flags = NLM_F_REQUEST;
6306 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6307 tcm->tcm_family = AF_UNSPEC;
6308 tcm->tcm_ifindex = ifindex;
6309 tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6310 tcm->tcm_parent = TC_H_INGRESS;
6311 assert(sizeof(buf) >= nlh->nlmsg_len);
6312 /* Ignore errors when qdisc is already absent. */
6313 if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL) &&
6314 rte_errno != EINVAL && rte_errno != ENOENT)
6315 return rte_flow_error_set(error, rte_errno,
6316 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6317 "netlink: failed to remove ingress"
6318 " qdisc");
6319 /* Create fresh ingress qdisc. */
6320 nlh = mnl_nlmsg_put_header(buf);
6321 nlh->nlmsg_type = RTM_NEWQDISC;
6322 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
6323 tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
6324 tcm->tcm_family = AF_UNSPEC;
6325 tcm->tcm_ifindex = ifindex;
6326 tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
6327 tcm->tcm_parent = TC_H_INGRESS;
6328 mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
6329 assert(sizeof(buf) >= nlh->nlmsg_len);
6330 if (flow_tcf_nl_ack(ctx, nlh, NULL, NULL))
6331 return rte_flow_error_set(error, rte_errno,
6332 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
6333 "netlink: failed to create ingress"
6334 " qdisc");
6335 return 0;
6336 }
6337
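/*
 * Note (illustrative, not part of the driver): the two Netlink requests
 * above are roughly equivalent to the iproute2 commands
 *
 *     tc qdisc del dev <ifname> ingress
 *     tc qdisc add dev <ifname> ingress
 *
 * run against the interface whose index is "ifindex" (<ifname> is a
 * placeholder for its name). The flower filters created elsewhere in this
 * file are attached to this ingress qdisc.
 */
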
6338 /**
6339 * Create libmnl context for Netlink flow rules.
6340 *
6341 * @return
6342 * A valid libmnl socket object pointer on success, NULL otherwise and
6343 * rte_errno is set.
6344 */
6345 struct mlx5_flow_tcf_context *
6346 mlx5_flow_tcf_context_create(void)
6347 {
6348 struct mlx5_flow_tcf_context *ctx = rte_zmalloc(__func__,
6349 sizeof(*ctx),
6350 sizeof(uint32_t));
6351 if (!ctx)
6352 goto error;
6353 ctx->nl = flow_tcf_mnl_socket_create();
6354 if (!ctx->nl)
6355 goto error;
6356 ctx->buf_size = MNL_SOCKET_BUFFER_SIZE;
6357 ctx->buf = rte_zmalloc(__func__,
6358 ctx->buf_size, sizeof(uint32_t));
6359 if (!ctx->buf)
6360 goto error;
6361 ctx->seq = random();
6362 return ctx;
6363 error:
6364 mlx5_flow_tcf_context_destroy(ctx);
6365 return NULL;
6366 }
6367
6368 /**
6369 * Destroy a libmnl context.
6370 *
6371 * @param ctx
6372 * Libmnl socket of the @p NETLINK_ROUTE kind.
6373 */
6374 void
6375 mlx5_flow_tcf_context_destroy(struct mlx5_flow_tcf_context *ctx)
6376 {
6377 if (!ctx)
6378 return;
6379 flow_tcf_mnl_socket_destroy(ctx->nl);
6380 rte_free(ctx->buf);
6381 rte_free(ctx);
6382 }
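
/*
 * Illustrative sketch (not part of the driver): typical lifecycle of the
 * tc-flower context as the PMD might use it. "ifindex" is a hypothetical
 * interface index and the error handling is only schematic.
 */
#if 0 /* example only, never compiled */
	struct mlx5_flow_tcf_context *tcf;
	struct rte_flow_error err;

	tcf = mlx5_flow_tcf_context_create();
	if (!tcf)
		return -rte_errno;
	if (mlx5_flow_tcf_init(tcf, ifindex, &err)) { /* set up ingress qdisc */
		mlx5_flow_tcf_context_destroy(tcf);
		return -rte_errno;
	}
	/* ... E-Switch flow rules are handled via mlx5_flow_tcf_drv_ops ... */
	mlx5_flow_tcf_context_destroy(tcf);
#endif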