/* XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP)
 *
 * GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
 */
#include <uapi/linux/if_ether.h>
#include <uapi/linux/if_packet.h>
#include <uapi/linux/if_vlan.h>
#include <uapi/linux/ip.h>
#include <uapi/linux/ipv6.h>
#include <uapi/linux/in.h>
#include <uapi/linux/tcp.h>
#include <uapi/linux/udp.h>

#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include "hash_func01.h"

#define MAX_CPUS NR_CPUS
/* Special map type that can XDP_REDIRECT frames to another CPU */
struct {
	__uint(type, BPF_MAP_TYPE_CPUMAP);
	__uint(key_size, sizeof(u32));
	__uint(value_size, sizeof(struct bpf_cpumap_val));
	__uint(max_entries, MAX_CPUS);
} cpu_map SEC(".maps");
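/* Usage sketch (an assumption, not taken from this file): user space is
 * expected to arm each cpu_map slot with a struct bpf_cpumap_val before
 * redirecting to it. The qsize and the cpu_map_fd variable below are only
 * illustrative, roughly:
 *
 *	struct bpf_cpumap_val cpumap_value = { .qsize = 192 };
 *	u32 cpu = 2;
 *	bpf_map_update_elem(cpu_map_fd, &cpu, &cpumap_value, 0);
 */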
/* Common stats data record to keep userspace more simple */
struct datarec {
	__u64 processed;
	__u64 dropped;
	__u64 issue;
	__u64 xdp_pass;
	__u64 xdp_drop;
	__u64 xdp_redirect;
};

/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
 * feedback.  Redirect TX errors can be caught via a tracepoint.
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, struct datarec);
	__uint(max_entries, 1);
} rx_cnt SEC(".maps");
/* Used by trace point */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, struct datarec);
	__uint(max_entries, 2);
	/* TODO: have entries for all possible errno's */
} redirect_err_cnt SEC(".maps");
/* Used by trace point */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, struct datarec);
	__uint(max_entries, MAX_CPUS);
} cpumap_enqueue_cnt SEC(".maps");
/* Used by trace point */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, struct datarec);
	__uint(max_entries, 1);
} cpumap_kthread_cnt SEC(".maps");
/* Set of maps controlling available CPU, and for iterating through
 * selectable redirect CPUs.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__type(key, u32);
	__type(value, u32);
	__uint(max_entries, MAX_CPUS);
} cpus_available SEC(".maps");
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__type(key, u32);
	__type(value, u32);
	__uint(max_entries, 1);
} cpus_count SEC(".maps");
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, u32);
	__uint(max_entries, 1);
} cpus_iterator SEC(".maps");
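/* Loader sketch (an assumption about the companion user-space program, not
 * taken from this file): the CPUs selected for redirect are written into
 * cpus_available at indexes 0..n-1 and cpus_count[0] is set to n, so the
 * round-robin and hashing programs below iterate over exactly n valid
 * entries. The *_fd names are hypothetical map fds:
 *
 *	u32 i, key0 = 0, n = 2, cpus[2] = { 2, 3 };
 *	for (i = 0; i < n; i++)
 *		bpf_map_update_elem(cpus_available_fd, &i, &cpus[i], 0);
 *	bpf_map_update_elem(cpus_count_fd, &key0, &n, 0);
 */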
/* Used by trace point */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, struct datarec);
	__uint(max_entries, 1);
} exception_cnt SEC(".maps");
/* Helper parse functions */

/* Parse Ethernet layer 2, extract network layer 3 offset and protocol
 *
 * Returns false on error and non-supported ether-type
 */
struct vlan_hdr {
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};
static __always_inline
bool parse_eth(struct ethhdr *eth, void *data_end,
	       u16 *eth_proto, u64 *l3_offset)
{
	u16 eth_type;
	u64 offset;

	offset = sizeof(*eth);
	if ((void *)eth + offset > data_end)
		return false;

	eth_type = eth->h_proto;

	/* Skip non 802.3 Ethertypes */
	if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN))
		return false;

	/* Handle VLAN tagged packet */
	if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
		struct vlan_hdr *vlan_hdr;

		vlan_hdr = (void *)eth + offset;
		offset += sizeof(*vlan_hdr);
		if ((void *)eth + offset > data_end)
			return false;
		eth_type = vlan_hdr->h_vlan_encapsulated_proto;
	}
	/* Handle double VLAN tagged packet */
	if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
		struct vlan_hdr *vlan_hdr;

		vlan_hdr = (void *)eth + offset;
		offset += sizeof(*vlan_hdr);
		if ((void *)eth + offset > data_end)
			return false;
		eth_type = vlan_hdr->h_vlan_encapsulated_proto;
	}

	*eth_proto = ntohs(eth_type);
	*l3_offset = offset;
	return true;
}
static __always_inline
u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct iphdr *iph = data + nh_off;
	struct udphdr *udph;
	u16 dport;

	if (iph + 1 > data_end)
		return 0;
	if (!(iph->protocol == IPPROTO_UDP))
		return 0;

	udph = (void *)(iph + 1);
	if (udph + 1 > data_end)
		return 0;

	dport = ntohs(udph->dest);
	return dport;
}
static __always_inline
int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct iphdr *iph = data + nh_off;

	if (iph + 1 > data_end)
		return 0;
	return iph->protocol;
}
static __always_inline
int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ipv6hdr *ip6h = data + nh_off;

	if (ip6h + 1 > data_end)
		return 0;
	return ip6h->nexthdr;
}
SEC("xdp_cpu_map0")
int xdp_prognum0_no_touch(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct datarec *rec;
	u32 *cpu_selected;
	u32 cpu_dest;
	u32 key = 0;

	/* Only use first entry in cpus_available */
	cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
SEC("xdp_cpu_map1_touch_data")
int xdp_prognum1_touch_data(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	struct datarec *rec;
	u32 *cpu_selected;
	u32 cpu_dest;
	u16 eth_type;
	u32 key = 0;

	/* Only use first entry in cpus_available */
	cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Validate packet length is minimum Eth header size */
	if (eth + 1 > data_end)
		return XDP_ABORTED;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	/* Read packet data, and use it (drop non 802.3 Ethertypes) */
	eth_type = eth->h_proto;
	if (ntohs(eth_type) < ETH_P_802_3_MIN) {
		rec->dropped++;
		return XDP_DROP;
	}

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
SEC("xdp_cpu_map2_round_robin")
int xdp_prognum2_round_robin(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	struct datarec *rec;
	u32 *cpu_selected;
	u32 *cpu_iterator;
	u32 *cpu_max;
	u32 cpu_dest;
	u32 cpu_idx;
	u32 key0 = 0;

	cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
	if (!cpu_max)
		return XDP_ABORTED;

	cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0);
	if (!cpu_iterator)
		return XDP_ABORTED;
	cpu_idx = *cpu_iterator;

	*cpu_iterator += 1;
	if (*cpu_iterator == *cpu_max)
		*cpu_iterator = 0;

	cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key0);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
SEC("xdp_cpu_map3_proto_separate")
int xdp_prognum3_proto_separate(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	u8 ip_proto = IPPROTO_UDP;
	struct datarec *rec;
	u16 eth_proto = 0;
	u64 l3_offset = 0;
	u32 cpu_dest = 0;
	u32 cpu_idx = 0;
	u32 *cpu_lookup;
	u32 key = 0;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
		return XDP_PASS; /* Just skip */

	/* Extract L4 protocol */
	switch (eth_proto) {
	case ETH_P_IP:
		ip_proto = get_proto_ipv4(ctx, l3_offset);
		break;
	case ETH_P_IPV6:
		ip_proto = get_proto_ipv6(ctx, l3_offset);
		break;
	case ETH_P_ARP:
		cpu_idx = 0; /* ARP packet handled on separate CPU */
		break;
	default:
		cpu_idx = 0;
	}

	/* Choose CPU based on L4 protocol */
	switch (ip_proto) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		cpu_idx = 2;
		break;
	case IPPROTO_TCP:
		cpu_idx = 0;
		break;
	case IPPROTO_UDP:
		cpu_idx = 1;
		break;
	default:
		cpu_idx = 0;
	}

	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
SEC("xdp_cpu_map4_ddos_filter_pktgen")
int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	u8 ip_proto = IPPROTO_UDP;
	struct datarec *rec;
	u16 eth_proto = 0;
	u64 l3_offset = 0;
	u32 cpu_dest = 0;
	u32 cpu_idx = 0;
	u16 dest_port;
	u32 *cpu_lookup;
	u32 key = 0;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
		return XDP_PASS; /* Just skip */

	/* Extract L4 protocol */
	switch (eth_proto) {
	case ETH_P_IP:
		ip_proto = get_proto_ipv4(ctx, l3_offset);
		break;
	case ETH_P_IPV6:
		ip_proto = get_proto_ipv6(ctx, l3_offset);
		break;
	case ETH_P_ARP:
		cpu_idx = 0; /* ARP packet handled on separate CPU */
		break;
	default:
		cpu_idx = 0;
	}

	/* Choose CPU based on L4 protocol */
	switch (ip_proto) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		cpu_idx = 2;
		break;
	case IPPROTO_TCP:
		cpu_idx = 0;
		break;
	case IPPROTO_UDP:
		cpu_idx = 1;

		/* DDoS filter UDP port 9 (pktgen) */
		dest_port = get_dest_port_ipv4_udp(ctx, l3_offset);
		if (dest_port == 9) {
			if (rec)
				rec->dropped++;
			return XDP_DROP;
		}
		break;
	default:
		cpu_idx = 0;
	}

	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
/* Hashing initval */
#define INITVAL 15485863

static __always_inline
u32 get_ipv4_hash_ip_pair(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct iphdr *iph = data + nh_off;
	u32 cpu_hash;

	if (iph + 1 > data_end)
		return 0;

	cpu_hash = iph->saddr + iph->daddr;
	cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + iph->protocol);

	return cpu_hash;
}
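/* Note on symmetry: the hash input above is the plain 32-bit sum
 * saddr + daddr, and addition is commutative, so a flow and its reply
 * direction (src/dst swapped) produce the same cpu_hash and are steered
 * to the same CPU. The IPv6 helper below follows the same pattern.
 */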
static __always_inline
u32 get_ipv6_hash_ip_pair(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ipv6hdr *ip6h = data + nh_off;
	u32 cpu_hash;

	if (ip6h + 1 > data_end)
		return 0;

	cpu_hash  = ip6h->saddr.s6_addr32[0] + ip6h->daddr.s6_addr32[0];
	cpu_hash += ip6h->saddr.s6_addr32[1] + ip6h->daddr.s6_addr32[1];
	cpu_hash += ip6h->saddr.s6_addr32[2] + ip6h->daddr.s6_addr32[2];
	cpu_hash += ip6h->saddr.s6_addr32[3] + ip6h->daddr.s6_addr32[3];
	cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + ip6h->nexthdr);

	return cpu_hash;
}
/* Load-Balance traffic based on hashing IP-addrs + L4-proto.  The
 * hashing scheme is symmetric, meaning swapping IP src/dest still hit
 * same CPU.
 */
SEC("xdp_cpu_map5_lb_hash_ip_pairs")
int xdp_prognum5_lb_hash_ip_pairs(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	u8 ip_proto = IPPROTO_UDP;
	struct datarec *rec;
	u16 eth_proto = 0;
	u64 l3_offset = 0;
	u32 cpu_dest = 0;
	u32 cpu_idx = 0;
	u32 *cpu_lookup;
	u32 *cpu_max;
	u32 cpu_hash;
	u32 key = 0;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	cpu_max = bpf_map_lookup_elem(&cpus_count, &key);
	if (!cpu_max)
		return XDP_ABORTED;

	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
		return XDP_PASS; /* Just skip */

	/* Hash for IPv4 and IPv6 */
	switch (eth_proto) {
	case ETH_P_IP:
		cpu_hash = get_ipv4_hash_ip_pair(ctx, l3_offset);
		break;
	case ETH_P_IPV6:
		cpu_hash = get_ipv6_hash_ip_pair(ctx, l3_offset);
		break;
	case ETH_P_ARP: /* ARP packet handled on CPU idx 0 */
	default:
		cpu_hash = 0;
	}

	/* Choose CPU based on hash */
	cpu_idx = cpu_hash % *cpu_max;

	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
char _license[] SEC("license") = "GPL";
/*** Trace point code ***/

/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
 * Code in:            kernel/include/trace/events/xdp.h
 */
struct xdp_redirect_ctx {
	u64 __pad;	// First 8 bytes are not accessible by bpf code
	int prog_id;	//	offset:8;  size:4; signed:1;
	u32 act;	//	offset:12; size:4; signed:0;
	int ifindex;	//	offset:16; size:4; signed:1;
	int err;	//	offset:20; size:4; signed:1;
	int to_ifindex;	//	offset:24; size:4; signed:1;
	u32 map_id;	//	offset:28; size:4; signed:0;
	int map_index;	//	offset:32; size:4; signed:1;
};

enum {
	XDP_REDIRECT_SUCCESS = 0,
	XDP_REDIRECT_ERROR = 1
};

static __always_inline
int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
{
	u32 key = XDP_REDIRECT_ERROR;
	struct datarec *rec;
	int err = ctx->err;

	if (!err)
		key = XDP_REDIRECT_SUCCESS;

	rec = bpf_map_lookup_elem(&redirect_err_cnt, &key);
	if (!rec)
		return 0;
	rec->dropped += 1;

	return 0; /* Indicate event was filtered (no further processing) */
	/*
	 * Returning 1 here would allow e.g. a perf-record tracepoint
	 * to see and record these events, but it doesn't work well
	 * in practice as stopping perf-record also unloads this
	 * bpf_prog.  Plus, there is additional overhead of doing so.
	 */
}

SEC("tracepoint/xdp/xdp_redirect_err")
int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx)
{
	return xdp_redirect_collect_stat(ctx);
}

SEC("tracepoint/xdp/xdp_redirect_map_err")
int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx)
{
	return xdp_redirect_collect_stat(ctx);
}
/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
 * Code in:            kernel/include/trace/events/xdp.h
 */
struct xdp_exception_ctx {
	u64 __pad;	// First 8 bytes are not accessible by bpf code
	int prog_id;	//	offset:8;  size:4; signed:1;
	u32 act;	//	offset:12; size:4; signed:0;
	int ifindex;	//	offset:16; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_exception")
int trace_xdp_exception(struct xdp_exception_ctx *ctx)
{
	struct datarec *rec;
	u32 key = 0;

	rec = bpf_map_lookup_elem(&exception_cnt, &key);
	if (!rec)
		return 1;
	rec->dropped += 1;

	return 0;
}
/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
 * Code in:      kernel/include/trace/events/xdp.h
 */
struct cpumap_enqueue_ctx {
	u64 __pad;		// First 8 bytes are not accessible by bpf code
	int map_id;		//	offset:8;  size:4; signed:1;
	u32 act;		//	offset:12; size:4; signed:0;
	int cpu;		//	offset:16; size:4; signed:1;
	unsigned int drops;	//	offset:20; size:4; signed:0;
	unsigned int processed;	//	offset:24; size:4; signed:0;
	int to_cpu;		//	offset:28; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_cpumap_enqueue")
int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
{
	u32 to_cpu = ctx->to_cpu;
	struct datarec *rec;

	if (to_cpu >= MAX_CPUS)
		return 1;

	rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu);
	if (!rec)
		return 0;
	rec->processed += ctx->processed;
	rec->dropped   += ctx->drops;

	/* Record bulk events, then userspace can calc average bulk size */
	if (ctx->processed > 0)
		rec->issue += 1;

	/* Inception: It's possible to detect overload situations, via
	 * this tracepoint. This can be used for creating a feedback
	 * loop to XDP, which can take appropriate actions to mitigate
	 * this overload situation.
	 */
	return 0;
}
/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
 * Code in:      kernel/include/trace/events/xdp.h
 */
struct cpumap_kthread_ctx {
	u64 __pad;			// First 8 bytes are not accessible
	int map_id;			//	offset:8;  size:4; signed:1;
	u32 act;			//	offset:12; size:4; signed:0;
	int cpu;			//	offset:16; size:4; signed:1;
	unsigned int drops;		//	offset:20; size:4; signed:0;
	unsigned int processed;		//	offset:24; size:4; signed:0;
	int sched;			//	offset:28; size:4; signed:1;
	unsigned int xdp_pass;		//	offset:32; size:4; signed:0;
	unsigned int xdp_drop;		//	offset:36; size:4; signed:0;
	unsigned int xdp_redirect;	//	offset:40; size:4; signed:0;
};

SEC("tracepoint/xdp/xdp_cpumap_kthread")
int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
{
	struct datarec *rec;
	u32 key = 0;

	rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key);
	if (!rec)
		return 0;
	rec->processed += ctx->processed;
	rec->dropped   += ctx->drops;
	rec->xdp_pass  += ctx->xdp_pass;
	rec->xdp_drop  += ctx->xdp_drop;
	rec->xdp_redirect += ctx->xdp_redirect;

	/* Count times kthread yielded CPU via schedule call */
	if (ctx->sched)
		rec->issue++;

	return 0;
}