/* XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP)
 *
 * GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
 */
#include <uapi/linux/if_ether.h>
#include <uapi/linux/if_packet.h>
#include <uapi/linux/if_vlan.h>
#include <uapi/linux/ip.h>
#include <uapi/linux/ipv6.h>
#include <uapi/linux/in.h>
#include <uapi/linux/tcp.h>
#include <uapi/linux/udp.h>

#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"
#include "hash_func01.h"

#define MAX_CPUS 64 /* WARNING - sync with _user.c */
/* Special map type that can XDP_REDIRECT frames to another CPU */
struct bpf_map_def SEC("maps") cpu_map = {
	.type		= BPF_MAP_TYPE_CPUMAP,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= MAX_CPUS,
};
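/* Illustrative sketch (names and values here are hypothetical; the real
 * setup lives in the matching _user.c loader): userspace must populate
 * cpu_map before redirects can succeed.  For a cpumap the key is the
 * destination CPU and the u32 value is the queue size for that CPU's
 * kthread, e.g. from userspace:
 *
 *	__u32 cpu = 2;		// key: destination CPU
 *	__u32 qsize = 192;	// value: per-CPU queue size (example value)
 *	bpf_map_update_elem(cpu_map_fd, &cpu, &qsize, 0);
 */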
/* Common stats data record to keep userspace more simple */
struct datarec {
	__u64 processed;
	__u64 dropped;
	__u64 issue;
};

/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
 * feedback.  Redirect TX errors can be caught via a tracepoint.
 */
struct bpf_map_def SEC("maps") rx_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= 1,
};
/* Used by trace point */
struct bpf_map_def SEC("maps") redirect_err_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= 2,
	/* TODO: have entries for all possible errno's */
};
/* Used by trace point */
struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= MAX_CPUS,
};
/* Used by trace point */
struct bpf_map_def SEC("maps") cpumap_kthread_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= 1,
};
/* Set of maps controlling available CPU, and for iterating through
 * selectable redirect CPUs.  (An illustrative userspace populate sketch
 * follows these three map definitions.)
 */
struct bpf_map_def SEC("maps") cpus_available = {
	.type		= BPF_MAP_TYPE_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= MAX_CPUS,
};
struct bpf_map_def SEC("maps") cpus_count = {
	.type		= BPF_MAP_TYPE_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= 1,
};
struct bpf_map_def SEC("maps") cpus_iterator = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= 1,
};
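/* Illustrative sketch (names and values are hypothetical; the real logic
 * lives in the matching _user.c loader): the programs below pick an index
 * into cpus_available and bound it by cpus_count, so userspace must keep
 * the two maps consistent, roughly like:
 *
 *	__u32 key, ncpus = 2;
 *	__u32 cpus[2] = { 2, 3 };	// example: redirect only to CPU 2 and 3
 *	for (key = 0; key < ncpus; key++)
 *		bpf_map_update_elem(fd_cpus_available, &key, &cpus[key], 0);
 *	key = 0;
 *	bpf_map_update_elem(fd_cpus_count, &key, &ncpus, 0);
 */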
/* Used by trace point */
struct bpf_map_def SEC("maps") exception_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= 1,
};
/* Helper parse functions */

/* Parse Ethernet layer 2, extract network layer 3 offset and protocol.
 *
 * Returns false on error or a non-supported ether-type.
 */
struct vlan_hdr {
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

static __always_inline
bool parse_eth(struct ethhdr *eth, void *data_end,
	       u16 *eth_proto, u64 *l3_offset)
{
	u16 eth_type;
	u64 offset;

	offset = sizeof(*eth);
	if ((void *)eth + offset > data_end)
		return false;

	eth_type = eth->h_proto;

	/* Skip non 802.3 Ethertypes */
	if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN))
		return false;

	/* Handle VLAN tagged packet */
	if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
		struct vlan_hdr *vlan_hdr;

		vlan_hdr = (void *)eth + offset;
		offset += sizeof(*vlan_hdr);
		if ((void *)eth + offset > data_end)
			return false;
		eth_type = vlan_hdr->h_vlan_encapsulated_proto;
	}
	/* Handle double VLAN tagged packet */
	if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
		struct vlan_hdr *vlan_hdr;

		vlan_hdr = (void *)eth + offset;
		offset += sizeof(*vlan_hdr);
		if ((void *)eth + offset > data_end)
			return false;
		eth_type = vlan_hdr->h_vlan_encapsulated_proto;
	}

	*eth_proto = ntohs(eth_type);
	*l3_offset = offset;
	return true;
}
static __always_inline
u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct iphdr *iph = data + nh_off;
	struct udphdr *udph;
	u16 dport;

	if (iph + 1 > data_end)
		return 0;
	if (!(iph->protocol == IPPROTO_UDP))
		return 0;

	udph = (void *)(iph + 1);
	if (udph + 1 > data_end)
		return 0;

	dport = ntohs(udph->dest);
	return dport;
}
static __always_inline
int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct iphdr *iph = data + nh_off;

	if (iph + 1 > data_end)
		return 0;
	return iph->protocol;
}
static __always_inline
int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ipv6hdr *ip6h = data + nh_off;

	if (ip6h + 1 > data_end)
		return 0;
	return ip6h->nexthdr;
}
SEC("xdp_cpu_map0")
int xdp_prognum0_no_touch(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct datarec *rec;
	u32 *cpu_selected;
	u32 cpu_dest;
	u32 key = 0;

	/* Only use first entry in cpus_available */
	cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
SEC("xdp_cpu_map1_touch_data")
int xdp_prognum1_touch_data(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	struct datarec *rec;
	u32 *cpu_selected;
	u32 cpu_dest;
	u16 eth_type;
	u32 key = 0;

	/* Only use first entry in cpus_available */
	cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Validate packet length is minimum Eth header size */
	if (eth + 1 > data_end)
		return XDP_ABORTED;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	/* Read packet data, and use it (drop non 802.3 Ethertypes) */
	eth_type = eth->h_proto;
	if (ntohs(eth_type) < ETH_P_802_3_MIN) {
		rec->dropped++;
		return XDP_DROP;
	}

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
SEC("xdp_cpu_map2_round_robin")
int xdp_prognum2_round_robin(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	struct datarec *rec;
	u32 *cpu_selected;
	u32 *cpu_iterator;
	u32 *cpu_max;
	u32 cpu_dest;
	u32 cpu_idx;
	u32 key0 = 0;

	cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
	if (!cpu_max)
		return XDP_ABORTED;

	cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0);
	if (!cpu_iterator)
		return XDP_ABORTED;
	cpu_idx = *cpu_iterator;

	*cpu_iterator += 1;
	if (*cpu_iterator == *cpu_max)
		*cpu_iterator = 0;

	cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key0);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
SEC("xdp_cpu_map3_proto_separate")
int xdp_prognum3_proto_separate(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	u8 ip_proto = IPPROTO_UDP;
	struct datarec *rec;
	u16 eth_proto = 0;
	u64 l3_offset = 0;
	u32 cpu_dest = 0;
	u32 cpu_idx = 0;
	u32 *cpu_lookup;
	u32 key = 0;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
		return XDP_PASS; /* Just skip */

	/* Extract L4 protocol */
	switch (eth_proto) {
	case ETH_P_IP:
		ip_proto = get_proto_ipv4(ctx, l3_offset);
		break;
	case ETH_P_IPV6:
		ip_proto = get_proto_ipv6(ctx, l3_offset);
		break;
	case ETH_P_ARP:
		cpu_idx = 0; /* ARP packet handled on separate CPU */
		break;
	default:
		cpu_idx = 0;
	}

	/* Choose CPU based on L4 protocol */
	switch (ip_proto) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		cpu_idx = 2;
		break;
	case IPPROTO_TCP:
		cpu_idx = 0;
		break;
	case IPPROTO_UDP:
		cpu_idx = 1;
		break;
	default:
		cpu_idx = 0;
	}

	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
SEC("xdp_cpu_map4_ddos_filter_pktgen")
int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	u8 ip_proto = IPPROTO_UDP;
	struct datarec *rec;
	u16 eth_proto = 0;
	u64 l3_offset = 0;
	u32 cpu_dest = 0;
	u32 cpu_idx = 0;
	u16 dest_port;
	u32 *cpu_lookup;
	u32 key = 0;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
		return XDP_PASS; /* Just skip */

	/* Extract L4 protocol */
	switch (eth_proto) {
	case ETH_P_IP:
		ip_proto = get_proto_ipv4(ctx, l3_offset);
		break;
	case ETH_P_IPV6:
		ip_proto = get_proto_ipv6(ctx, l3_offset);
		break;
	case ETH_P_ARP:
		cpu_idx = 0; /* ARP packet handled on separate CPU */
		break;
	default:
		cpu_idx = 0;
	}

	/* Choose CPU based on L4 protocol */
	switch (ip_proto) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		cpu_idx = 2;
		break;
	case IPPROTO_TCP:
		cpu_idx = 0;
		break;
	case IPPROTO_UDP:
		cpu_idx = 1;
		/* DDoS filter UDP port 9 (pktgen) */
		dest_port = get_dest_port_ipv4_udp(ctx, l3_offset);
		if (dest_port == 9) {
			rec->dropped++;
			return XDP_DROP;
		}
		break;
	default:
		cpu_idx = 0;
	}

	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
/* Hashing initval */
#define INITVAL 15485863

static __always_inline
u32 get_ipv4_hash_ip_pair(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct iphdr *iph = data + nh_off;
	u32 cpu_hash;

	if (iph + 1 > data_end)
		return 0;

	cpu_hash = iph->saddr + iph->daddr;
	cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + iph->protocol);

	return cpu_hash;
}
static __always_inline
u32 get_ipv6_hash_ip_pair(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ipv6hdr *ip6h = data + nh_off;
	u32 cpu_hash;

	if (ip6h + 1 > data_end)
		return 0;

	cpu_hash  = ip6h->saddr.s6_addr32[0] + ip6h->daddr.s6_addr32[0];
	cpu_hash += ip6h->saddr.s6_addr32[1] + ip6h->daddr.s6_addr32[1];
	cpu_hash += ip6h->saddr.s6_addr32[2] + ip6h->daddr.s6_addr32[2];
	cpu_hash += ip6h->saddr.s6_addr32[3] + ip6h->daddr.s6_addr32[3];
	cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + ip6h->nexthdr);

	return cpu_hash;
}
/* Load-Balance traffic based on hashing IP-addrs + L4-proto.  The
 * hashing scheme is symmetric, meaning swapping IP src/dest still hits
 * the same CPU.
 */
SEC("xdp_cpu_map5_lb_hash_ip_pairs")
int xdp_prognum5_lb_hash_ip_pairs(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	u8 ip_proto = IPPROTO_UDP;
	struct datarec *rec;
	u16 eth_proto = 0;
	u64 l3_offset = 0;
	u32 cpu_dest = 0;
	u32 cpu_idx = 0;
	u32 *cpu_lookup;
	u32 *cpu_max;
	u32 cpu_hash;
	u32 key = 0;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	cpu_max = bpf_map_lookup_elem(&cpus_count, &key);
	if (!cpu_max)
		return XDP_ABORTED;

	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
		return XDP_PASS; /* Just skip */

	/* Hash for IPv4 and IPv6 */
	switch (eth_proto) {
	case ETH_P_IP:
		cpu_hash = get_ipv4_hash_ip_pair(ctx, l3_offset);
		break;
	case ETH_P_IPV6:
		cpu_hash = get_ipv6_hash_ip_pair(ctx, l3_offset);
		break;
	case ETH_P_ARP: /* ARP packet handled on CPU idx 0 */
	default:
		cpu_hash = 0;
	}

	/* Choose CPU based on hash */
	cpu_idx = cpu_hash % *cpu_max;

	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
char _license[] SEC("license") = "GPL";
/*** Trace point code ***/

/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
 * Code in:             kernel/include/trace/events/xdp.h
 */
struct xdp_redirect_ctx {
	u64 __pad;	// First 8 bytes are not accessible by bpf code
	int prog_id;	//	offset:8;  size:4; signed:1;
	u32 act;	//	offset:12; size:4; signed:0;
	int ifindex;	//	offset:16; size:4; signed:1;
	int err;	//	offset:20; size:4; signed:1;
	int to_ifindex;	//	offset:24; size:4; signed:1;
	u32 map_id;	//	offset:28; size:4; signed:0;
	int map_index;	//	offset:32; size:4; signed:1;
};

enum {
	XDP_REDIRECT_SUCCESS = 0,
	XDP_REDIRECT_ERROR = 1
};
static __always_inline
int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
{
	u32 key = XDP_REDIRECT_ERROR;
	struct datarec *rec;
	int err = ctx->err;

	if (!err)
		key = XDP_REDIRECT_SUCCESS;

	rec = bpf_map_lookup_elem(&redirect_err_cnt, &key);
	if (!rec)
		return 0;
	rec->dropped += 1;

	return 0; /* Indicate event was filtered (no further processing) */
	/*
	 * Returning 1 here would allow e.g. a perf-record tracepoint
	 * to see and record these events, but it doesn't work well in
	 * practice, as stopping perf-record also unloads this bpf_prog.
	 * Plus, there is additional overhead of doing so.
	 */
}
SEC("tracepoint/xdp/xdp_redirect_err")
int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx)
{
	return xdp_redirect_collect_stat(ctx);
}
SEC("tracepoint/xdp/xdp_redirect_map_err")
int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx)
{
	return xdp_redirect_collect_stat(ctx);
}
/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
 * Code in:             kernel/include/trace/events/xdp.h
 */
struct xdp_exception_ctx {
	u64 __pad;	// First 8 bytes are not accessible by bpf code
	int prog_id;	//	offset:8;  size:4; signed:1;
	u32 act;	//	offset:12; size:4; signed:0;
	int ifindex;	//	offset:16; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_exception")
int trace_xdp_exception(struct xdp_exception_ctx *ctx)
{
	struct datarec *rec;
	u32 key = 0;

	rec = bpf_map_lookup_elem(&exception_cnt, &key);
	if (!rec)
		return 1;
	rec->dropped += 1;

	return 0;
}
/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
 * Code in:     kernel/include/trace/events/xdp.h
 */
struct cpumap_enqueue_ctx {
	u64 __pad;		// First 8 bytes are not accessible by bpf code
	int map_id;		//	offset:8;  size:4; signed:1;
	u32 act;		//	offset:12; size:4; signed:0;
	int cpu;		//	offset:16; size:4; signed:1;
	unsigned int drops;	//	offset:20; size:4; signed:0;
	unsigned int processed;	//	offset:24; size:4; signed:0;
	int to_cpu;		//	offset:28; size:4; signed:1;
};
SEC("tracepoint/xdp/xdp_cpumap_enqueue")
int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
{
	u32 to_cpu = ctx->to_cpu;
	struct datarec *rec;

	if (to_cpu >= MAX_CPUS)
		return 1;

	rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu);
	if (!rec)
		return 0;
	rec->processed += ctx->processed;
	rec->dropped   += ctx->drops;

	/* Record bulk events, then userspace can calc average bulk size */
	if (ctx->processed > 0)
		rec->issue += 1;
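	/* Userspace-side sketch (assumption, not part of this program): with
	 * these counters an average enqueue bulk size can be derived as
	 * processed / issue, e.g. 64 packets over 8 bulk events gives an
	 * average bulk of 8.
	 */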
	/* Inception: It's possible to detect overload situations via this
	 * tracepoint.  This can be used for creating a feedback loop to
	 * XDP, which can take appropriate actions to mitigate this
	 * overload situation.
	 */
	return 0;
}
/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
 * Code in:     kernel/include/trace/events/xdp.h
 */
struct cpumap_kthread_ctx {
	u64 __pad;		// First 8 bytes are not accessible by bpf code
	int map_id;		//	offset:8;  size:4; signed:1;
	u32 act;		//	offset:12; size:4; signed:0;
	int cpu;		//	offset:16; size:4; signed:1;
	unsigned int drops;	//	offset:20; size:4; signed:0;
	unsigned int processed;	//	offset:24; size:4; signed:0;
	int sched;		//	offset:28; size:4; signed:1;
};
SEC("tracepoint/xdp/xdp_cpumap_kthread")
int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
{
	struct datarec *rec;
	u32 key = 0;

	rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key);
	if (!rec)
		return 0;
	rec->processed += ctx->processed;
	rec->dropped   += ctx->drops;

	/* Count times kthread yielded CPU via schedule call */
	if (ctx->sched)
		rec->issue++;

	return 0;
}