/* XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP)
 *
 * GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
 */

#include <uapi/linux/if_ether.h>
#include <uapi/linux/if_packet.h>
#include <uapi/linux/if_vlan.h>
#include <uapi/linux/ip.h>
#include <uapi/linux/ipv6.h>
#include <uapi/linux/in.h>
#include <uapi/linux/tcp.h>
#include <uapi/linux/udp.h>

#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

#define MAX_CPUS 12 /* WARNING - sync with _user.c */

/* Special map type that can XDP_REDIRECT frames to another CPU */
struct bpf_map_def SEC("maps") cpu_map = {
	.type		= BPF_MAP_TYPE_CPUMAP,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= MAX_CPUS,
};

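/* Note: in a cpumap the key is the destination CPU number, and the u32 value
 * is the queue size for that CPU's kernel kthread (configured from user
 * space).  bpf_redirect_map(&cpu_map, cpu, 0) enqueues the raw XDP frame to
 * the remote CPU, where the kthread allocates the SKB and continues normal
 * network stack processing.
 */
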
/* Common stats data record to keep userspace more simple */
struct datarec {
	__u64 processed;
	__u64 dropped;
	__u64 issue;
};

/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
 * feedback.  Redirect TX errors can be caught via a tracepoint.
 */
struct bpf_map_def SEC("maps") rx_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= 1,
};

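/* A minimal user-space sketch (not the actual _user.c) of how one of these
 * BPF_MAP_TYPE_PERCPU_ARRAY stats maps can be read: bpf_map_lookup_elem()
 * on a per-CPU map fills one struct datarec per possible CPU, which the
 * reader then sums.  The map_fd and nr_cpus variables are assumed to come
 * from the loader.
 *
 *	struct datarec values[nr_cpus];
 *	__u64 sum = 0;
 *	__u32 key = 0;
 *	int i;
 *
 *	if (bpf_map_lookup_elem(map_fd, &key, values) == 0)
 *		for (i = 0; i < nr_cpus; i++)
 *			sum += values[i].processed;
 */
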
/* Used by trace point */
struct bpf_map_def SEC("maps") redirect_err_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= 2,
	/* TODO: have entries for all possible errno's */
};

/* Used by trace point */
struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= MAX_CPUS,
};

/* Used by trace point */
struct bpf_map_def SEC("maps") cpumap_kthread_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= 1,
};

/* Set of maps controlling available CPU, and for iterating through
 * selectable redirect CPUs.
 */
struct bpf_map_def SEC("maps") cpus_available = {
	.type		= BPF_MAP_TYPE_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= MAX_CPUS,
};

struct bpf_map_def SEC("maps") cpus_count = {
	.type		= BPF_MAP_TYPE_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= 1,
};

struct bpf_map_def SEC("maps") cpus_iterator = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= 1,
};

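/* How the three maps above cooperate: user space fills cpus_available[i]
 * with the actual CPU numbers eligible for redirect, stores how many of
 * those slots are valid in cpus_count (key 0), and the round-robin program
 * below keeps its current position in cpus_iterator.
 */
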
/* Used by trace point */
struct bpf_map_def SEC("maps") exception_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= 1,
};

/* Helper parse functions */

/* Parse Ethernet layer 2, extract network layer 3 offset and protocol
 *
 * Returns false on error and non-supported ether-type
 */
struct vlan_hdr {
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

static __always_inline
bool parse_eth(struct ethhdr *eth, void *data_end,
	       u16 *eth_proto, u64 *l3_offset)
{
	u16 eth_type;
	u64 offset;

	offset = sizeof(*eth);
	if ((void *)eth + offset > data_end)
		return false;

	eth_type = eth->h_proto;

	/* Skip non 802.3 Ethertypes */
	if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN))
		return false;

	/* Handle VLAN tagged packet */
	if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
		struct vlan_hdr *vlan_hdr;

		vlan_hdr = (void *)eth + offset;
		offset += sizeof(*vlan_hdr);
		if ((void *)eth + offset > data_end)
			return false;
		eth_type = vlan_hdr->h_vlan_encapsulated_proto;
	}
	/* TODO: Handle double VLAN tagged packet */

	*eth_proto = ntohs(eth_type);
	*l3_offset = offset;
	return true;
}

static __always_inline
u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct iphdr *iph = data + nh_off;
	struct udphdr *udph;
	u16 dport;

	if (iph + 1 > data_end)
		return 0;
	if (!(iph->protocol == IPPROTO_UDP))
		return 0;

	udph = (void *)(iph + 1);
	if (udph + 1 > data_end)
		return 0;

	dport = ntohs(udph->dest);
	return dport;
}

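/* Note: the helper above locates the UDP header directly after a fixed-size
 * struct iphdr (iph + 1), i.e. it assumes an IPv4 header without options;
 * packets carrying IP options will not have their destination port read
 * correctly, which this sample tolerates for its pktgen-style traffic.
 */
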
static __always_inline
int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct iphdr *iph = data + nh_off;

	if (iph + 1 > data_end)
		return 0;
	return iph->protocol;
}

static __always_inline
int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ipv6hdr *ip6h = data + nh_off;

	if (ip6h + 1 > data_end)
		return 0;
	return ip6h->nexthdr;
}

SEC("xdp_cpu_map0")
int xdp_prognum0_no_touch(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct datarec *rec;
	u32 *cpu_selected;
	u32 cpu_dest;
	u32 key = 0;

	/* Only use first entry in cpus_available */
	cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

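/* The program above (xdp_prognum0_no_touch) redirects without ever reading
 * packet data, leaving the payload cache-cold on the RX CPU; the program
 * below (xdp_prognum1_touch_data) additionally reads the Ethernet header
 * first, so comparing the two shows the cost of touching packet data before
 * the redirect.
 */
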
220 SEC("xdp_cpu_map1_touch_data")
221 int xdp_prognum1_touch_data(struct xdp_md
*ctx
)
223 void *data_end
= (void *)(long)ctx
->data_end
;
224 void *data
= (void *)(long)ctx
->data
;
225 struct ethhdr
*eth
= data
;
232 /* Only use first entry in cpus_available */
233 cpu_selected
= bpf_map_lookup_elem(&cpus_available
, &key
);
236 cpu_dest
= *cpu_selected
;
238 /* Validate packet length is minimum Eth header size */
239 if (eth
+ 1 > data_end
)
242 /* Count RX packet in map */
243 rec
= bpf_map_lookup_elem(&rx_cnt
, &key
);
248 /* Read packet data, and use it (drop non 802.3 Ethertypes) */
249 eth_type
= eth
->h_proto
;
250 if (ntohs(eth_type
) < ETH_P_802_3_MIN
) {
255 if (cpu_dest
>= MAX_CPUS
) {
260 return bpf_redirect_map(&cpu_map
, cpu_dest
, 0);
SEC("xdp_cpu_map2_round_robin")
int xdp_prognum2_round_robin(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	struct datarec *rec;
	u32 cpu_dest;
	u32 key0 = 0;

	u32 *cpu_selected;
	u32 *cpu_iterator;
	u32 *cpu_max;
	u32 cpu_idx;

	cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
	if (!cpu_max)
		return XDP_ABORTED;

	cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0);
	if (!cpu_iterator)
		return XDP_ABORTED;
	cpu_idx = *cpu_iterator;

	*cpu_iterator += 1;
	if (*cpu_iterator == *cpu_max)
		*cpu_iterator = 0;

	cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key0);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

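/* Because cpus_iterator is a BPF_MAP_TYPE_PERCPU_ARRAY, each RX CPU keeps
 * its own round-robin position; the rotation is per-RX-CPU rather than
 * globally synchronized, which avoids cross-CPU contention on the iterator.
 */
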
SEC("xdp_cpu_map3_proto_separate")
int xdp_prognum3_proto_separate(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	u8 ip_proto = IPPROTO_UDP;
	struct datarec *rec;
	u16 eth_proto = 0;
	u64 l3_offset = 0;
	u32 cpu_dest = 0;
	u32 cpu_idx = 0;
	u32 *cpu_lookup;
	u32 key = 0;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
		return XDP_PASS; /* Just skip */

	/* Extract L4 protocol */
	switch (eth_proto) {
	case ETH_P_IP:
		ip_proto = get_proto_ipv4(ctx, l3_offset);
		break;
	case ETH_P_IPV6:
		ip_proto = get_proto_ipv6(ctx, l3_offset);
		break;
	case ETH_P_ARP:
		cpu_idx = 0; /* ARP packet handled on separate CPU */
		break;
	default:
		cpu_idx = 0;
	}

	/* Choose CPU based on L4 protocol */
	switch (ip_proto) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		cpu_idx = 2;
		break;
	case IPPROTO_TCP:
		cpu_idx = 0;
		break;
	case IPPROTO_UDP:
		cpu_idx = 1;
		break;
	default:
		cpu_idx = 0;
	}

	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

SEC("xdp_cpu_map4_ddos_filter_pktgen")
int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	u8 ip_proto = IPPROTO_UDP;
	struct datarec *rec;
	u16 eth_proto = 0;
	u64 l3_offset = 0;
	u32 cpu_dest = 0;
	u32 cpu_idx = 0;
	u16 dest_port;
	u32 *cpu_lookup;
	u32 key = 0;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
		return XDP_PASS; /* Just skip */

	/* Extract L4 protocol */
	switch (eth_proto) {
	case ETH_P_IP:
		ip_proto = get_proto_ipv4(ctx, l3_offset);
		break;
	case ETH_P_IPV6:
		ip_proto = get_proto_ipv6(ctx, l3_offset);
		break;
	case ETH_P_ARP:
		cpu_idx = 0; /* ARP packet handled on separate CPU */
		break;
	default:
		cpu_idx = 0;
	}

	/* Choose CPU based on L4 protocol */
	switch (ip_proto) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		cpu_idx = 2;
		break;
	case IPPROTO_TCP:
		cpu_idx = 0;
		break;
	case IPPROTO_UDP:
		cpu_idx = 1;
		/* DDoS filter UDP port 9 (pktgen) */
		dest_port = get_dest_port_ipv4_udp(ctx, l3_offset);
		if (dest_port == 9) {
			rec->dropped++;
			return XDP_DROP;
		}
		break;
	default:
		cpu_idx = 0;
	}

	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

char _license[] SEC("license") = "GPL";

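/* A rough user-space setup sketch (illustrative only; the real code lives in
 * the matching _user.c).  For cpu_map the value written per CPU is the queue
 * size of that CPU's kthread; cpus_available and cpus_count then select which
 * CPUs the XDP programs may pick.  map_fd() is an assumed helper returning
 * the fd of a map by name.
 *
 *	__u32 cpu = 2, avail_idx = 0, count = 1, qsize = 192, key0 = 0;
 *
 *	bpf_map_update_elem(map_fd("cpu_map"), &cpu, &qsize, 0);
 *	bpf_map_update_elem(map_fd("cpus_available"), &avail_idx, &cpu, 0);
 *	bpf_map_update_elem(map_fd("cpus_count"), &key0, &count, 0);
 */
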
/*** Trace point code ***/

/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
 * Code in:                kernel/include/trace/events/xdp.h
 */
struct xdp_redirect_ctx {
	u64 __pad;	// First 8 bytes are not accessible by bpf code
	int prog_id;	//	offset:8;  size:4; signed:1;
	u32 act;	//	offset:12; size:4; signed:0;
	int ifindex;	//	offset:16; size:4; signed:1;
	int err;	//	offset:20; size:4; signed:1;
	int to_ifindex;	//	offset:24; size:4; signed:1;
	u32 map_id;	//	offset:28; size:4; signed:0;
	int map_index;	//	offset:32; size:4; signed:1;
};

enum {
	XDP_REDIRECT_SUCCESS = 0,
	XDP_REDIRECT_ERROR = 1
};

static __always_inline
int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
{
	u32 key = XDP_REDIRECT_ERROR;
	struct datarec *rec;
	int err = ctx->err;

	if (!err)
		key = XDP_REDIRECT_SUCCESS;

	rec = bpf_map_lookup_elem(&redirect_err_cnt, &key);
	if (!rec)
		return 0;
	rec->dropped += 1;

	return 0; /* Indicate event was filtered (no further processing) */
	/*
	 * Returning 1 here would allow e.g. a perf-record tracepoint
	 * to see and record these events, but it doesn't work well
	 * in practice, as stopping perf-record also unloads this
	 * bpf_prog.  Plus, there is additional overhead of doing so.
	 */
}

SEC("tracepoint/xdp/xdp_redirect_err")
int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx)
{
	return xdp_redirect_collect_stat(ctx);
}

SEC("tracepoint/xdp/xdp_redirect_map_err")
int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx)
{
	return xdp_redirect_collect_stat(ctx);
}

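/* Both the xdp_redirect_err and xdp_redirect_map_err tracepoints share the
 * field layout described by struct xdp_redirect_ctx above, so the single
 * collector xdp_redirect_collect_stat() serves both probes.
 */
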
/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
 * Code in:                kernel/include/trace/events/xdp.h
 */
struct xdp_exception_ctx {
	u64 __pad;	// First 8 bytes are not accessible by bpf code
	int prog_id;	//	offset:8;  size:4; signed:1;
	u32 act;	//	offset:12; size:4; signed:0;
	int ifindex;	//	offset:16; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_exception")
int trace_xdp_exception(struct xdp_exception_ctx *ctx)
{
	struct datarec *rec;
	u32 key = 0;

	rec = bpf_map_lookup_elem(&exception_cnt, &key);
	if (!rec)
		return 1;
	rec->dropped += 1;

	return 0;
}

/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
 * Code in:         kernel/include/trace/events/xdp.h
 */
struct cpumap_enqueue_ctx {
	u64 __pad;		// First 8 bytes are not accessible by bpf code
	int map_id;		//	offset:8;  size:4; signed:1;
	u32 act;		//	offset:12; size:4; signed:0;
	int cpu;		//	offset:16; size:4; signed:1;
	unsigned int drops;	//	offset:20; size:4; signed:0;
	unsigned int processed;	//	offset:24; size:4; signed:0;
	int to_cpu;		//	offset:28; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_cpumap_enqueue")
int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
{
	u32 to_cpu = ctx->to_cpu;
	struct datarec *rec;

	if (to_cpu >= MAX_CPUS)
		return 1;

	rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu);
	if (!rec)
		return 0;
	rec->processed += ctx->processed;
	rec->dropped   += ctx->drops;

	/* Record bulk events, then userspace can calc average bulk size */
	if (ctx->processed > 0)
		rec->issue += 1;

	/* Inception: It's possible to detect overload situations, via
	 * this tracepoint.  This can be used for creating a feedback
	 * loop to XDP, which can take appropriate actions to mitigate
	 * this overload situation.
	 */
	return 0;
}

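/* From the counters above, user space can derive the average enqueue bulk
 * size for a CPU as processed / issue (frames per non-empty enqueue event),
 * a useful indicator of how well cpumap batching behaves under load.
 */
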
/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
 * Code in:         kernel/include/trace/events/xdp.h
 */
struct cpumap_kthread_ctx {
	u64 __pad;		// First 8 bytes are not accessible by bpf code
	int map_id;		//	offset:8;  size:4; signed:1;
	u32 act;		//	offset:12; size:4; signed:0;
	int cpu;		//	offset:16; size:4; signed:1;
	unsigned int drops;	//	offset:20; size:4; signed:0;
	unsigned int processed;	//	offset:24; size:4; signed:0;
	int sched;		//	offset:28; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_cpumap_kthread")
int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
{
	struct datarec *rec;
	u32 key = 0;

	rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key);
	if (!rec)
		return 0;
	rec->processed += ctx->processed;
	rec->dropped   += ctx->drops;

	/* Count times kthread yielded CPU via schedule call */
	if (ctx->sched)
		rec->issue++;

	return 0;
}