/* XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP)
 *
 * GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
 */
#include <uapi/linux/if_ether.h>
#include <uapi/linux/if_packet.h>
#include <uapi/linux/if_vlan.h>
#include <uapi/linux/ip.h>
#include <uapi/linux/ipv6.h>
#include <uapi/linux/in.h>
#include <uapi/linux/tcp.h>
#include <uapi/linux/udp.h>

#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

#define MAX_CPUS 12 /* WARNING - sync with _user.c */

/* Special map type that can XDP_REDIRECT frames to another CPU */
struct bpf_map_def SEC("maps") cpu_map = {
	.type = BPF_MAP_TYPE_CPUMAP,
	.key_size = sizeof(u32),
	.value_size = sizeof(u32),
	.max_entries = MAX_CPUS,
};
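
/* The cpumap is keyed by CPU number; an entry only becomes a usable redirect
 * target after userspace has populated it.  A rough, illustrative sketch of
 * the loader side (variable names here are made up; the value is assumed to
 * be the queue size for the per-CPU kthread, see xdp_redirect_cpu_user.c for
 * the authoritative version):
 *
 *	__u32 cpu = 2;		// key: destination CPU number
 *	__u32 qsize = 192;	// value: assumed kthread queue size
 *	bpf_map_update_elem(cpu_map_fd, &cpu, &qsize, 0);
 *
 * Redirecting to a CPU that was never added this way makes the redirect fail,
 * which is the kind of error counted by the redirect-error tracepoint
 * handlers further down in this file.
 */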

/* Common stats data record to keep userspace simpler */
struct datarec {
	__u64 processed;
	__u64 dropped;
	__u64 issue;
};

/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
 * feedback.  Redirect TX errors can be caught via a tracepoint.
 */
struct bpf_map_def SEC("maps") rx_cnt = {
	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size = sizeof(u32),
	.value_size = sizeof(struct datarec),
	.max_entries = 1,
};
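
/* rx_cnt is a PERCPU_ARRAY, so each CPU bumps its own copy of the counters
 * without atomics, and userspace has to sum the per-CPU copies to get totals.
 * A minimal read-out sketch (illustrative only; the fd and array names are
 * assumptions, not code from the companion _user.c):
 *
 *	struct datarec values[nr_cpus];	// lookup fills one slot per CPU
 *	__u32 key = 0;
 *	__u64 sum = 0;
 *	if (bpf_map_lookup_elem(rx_cnt_fd, &key, values) == 0)
 *		for (int i = 0; i < nr_cpus; i++)
 *			sum += values[i].processed;
 */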

/* Used by trace point */
struct bpf_map_def SEC("maps") redirect_err_cnt = {
	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size = sizeof(u32),
	.value_size = sizeof(struct datarec),
	.max_entries = 2,
	/* TODO: have entries for all possible errnos */
};

/* Used by trace point */
struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = {
	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size = sizeof(u32),
	.value_size = sizeof(struct datarec),
	.max_entries = MAX_CPUS,
};

/* Used by trace point */
struct bpf_map_def SEC("maps") cpumap_kthread_cnt = {
	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size = sizeof(u32),
	.value_size = sizeof(struct datarec),
	.max_entries = 1,
};

/* Set of maps controlling available CPUs, and for iterating through
 * the selectable redirect CPUs.
 */
struct bpf_map_def SEC("maps") cpus_available = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(u32),
	.value_size = sizeof(u32),
	.max_entries = MAX_CPUS,
};
struct bpf_map_def SEC("maps") cpus_count = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(u32),
	.value_size = sizeof(u32),
	.max_entries = 1,
};
struct bpf_map_def SEC("maps") cpus_iterator = {
	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size = sizeof(u32),
	.value_size = sizeof(u32),
	.max_entries = 1,
};
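
/* How the programs below use this set of maps: cpus_available maps an index
 * to a real CPU number, cpus_count (key 0) says how many of those indexes are
 * valid, and cpus_iterator is a per-CPU cursor into cpus_available used by
 * the round-robin program.  Because the cursor is per-CPU, the round-robin is
 * only approximate when several RX CPUs run the program concurrently.
 */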

/* Used by trace point */
struct bpf_map_def SEC("maps") exception_cnt = {
	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size = sizeof(u32),
	.value_size = sizeof(struct datarec),
	.max_entries = 1,
};

/* Helper parse functions */

/* Parse Ethernet layer 2, extract network layer 3 offset and protocol
 *
 * Returns false on error or unsupported ether-type
 */
struct vlan_hdr {
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

static __always_inline
bool parse_eth(struct ethhdr *eth, void *data_end,
	       u16 *eth_proto, u64 *l3_offset)
{
	u16 eth_type;
	u64 offset;

	offset = sizeof(*eth);
	if ((void *)eth + offset > data_end)
		return false;

	eth_type = eth->h_proto;

	/* Skip non 802.3 Ethertypes */
	if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN))
		return false;

	/* Handle VLAN tagged packet */
	if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
		struct vlan_hdr *vlan_hdr;

		vlan_hdr = (void *)eth + offset;
		offset += sizeof(*vlan_hdr);
		if ((void *)eth + offset > data_end)
			return false;
		eth_type = vlan_hdr->h_vlan_encapsulated_proto;
	}
	/* TODO: Handle double VLAN tagged packet */

	*eth_proto = ntohs(eth_type);
	*l3_offset = offset;
	return true;
}

static __always_inline
u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data = (void *)(long)ctx->data;
	struct iphdr *iph = data + nh_off;
	struct udphdr *udph;
	u16 dport;

	if (iph + 1 > data_end)
		return 0;
	if (!(iph->protocol == IPPROTO_UDP))
		return 0;

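	/* Note: (iph + 1) assumes a minimal 20-byte IPv4 header; this parser
	 * ignores iph->ihl, so packets carrying IPv4 options would have their
	 * "UDP header" read from the wrong offset.
	 */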
	udph = (void *)(iph + 1);
	if (udph + 1 > data_end)
		return 0;

	dport = ntohs(udph->dest);
	return dport;
}

static __always_inline
int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data = (void *)(long)ctx->data;
	struct iphdr *iph = data + nh_off;

	if (iph + 1 > data_end)
		return 0;
	return iph->protocol;
}

static __always_inline
int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data = (void *)(long)ctx->data;
	struct ipv6hdr *ip6h = data + nh_off;

	if (ip6h + 1 > data_end)
		return 0;
	return ip6h->nexthdr;
}

SEC("xdp_cpu_map0")
int xdp_prognum0_no_touch(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data = (void *)(long)ctx->data;
	struct datarec *rec;
	u32 *cpu_selected;
	u32 cpu_dest;
	u32 key = 0;

	/* Only use first entry in cpus_available */
	cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

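	/* On success bpf_redirect_map() makes this program return XDP_REDIRECT,
	 * and the frame is enqueued towards the kthread that the cpumap runs
	 * on cpu_dest (created when userspace added that map entry); the
	 * kthread then builds an SKB and hands the packet to the normal
	 * network stack on that CPU.
	 */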
	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

SEC("xdp_cpu_map1_touch_data")
int xdp_prognum1_touch_data(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	struct datarec *rec;
	u32 *cpu_selected;
	u32 cpu_dest;
	u16 eth_type;
	u32 key = 0;

	/* Only use first entry in cpus_available */
	cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Validate packet length covers at least an Ethernet header */
	if (eth + 1 > data_end)
		return XDP_ABORTED;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	/* Read packet data, and use it (drop non 802.3 Ethertypes) */
	eth_type = eth->h_proto;
	if (ntohs(eth_type) < ETH_P_802_3_MIN) {
		rec->dropped++;
		return XDP_DROP;
	}

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

SEC("xdp_cpu_map2_round_robin")
int xdp_prognum2_round_robin(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	struct datarec *rec;
	u32 cpu_dest;
	u32 *cpu_lookup;
	u32 key0 = 0;

	u32 *cpu_selected;
	u32 *cpu_iterator;
	u32 *cpu_max;
	u32 cpu_idx;

	cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
	if (!cpu_max)
		return XDP_ABORTED;

	cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0);
	if (!cpu_iterator)
		return XDP_ABORTED;
	cpu_idx = *cpu_iterator;

	*cpu_iterator += 1;
	if (*cpu_iterator == *cpu_max)
		*cpu_iterator = 0;

	cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key0);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

SEC("xdp_cpu_map3_proto_separate")
int xdp_prognum3_proto_separate(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	u8 ip_proto = IPPROTO_UDP;
	struct datarec *rec;
	u16 eth_proto = 0;
	u64 l3_offset = 0;
	u32 cpu_dest = 0;
	u32 cpu_idx = 0;
	u32 *cpu_lookup;
	u32 key = 0;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
		return XDP_PASS; /* Just skip */

	/* Extract L4 protocol */
	switch (eth_proto) {
	case ETH_P_IP:
		ip_proto = get_proto_ipv4(ctx, l3_offset);
		break;
	case ETH_P_IPV6:
		ip_proto = get_proto_ipv6(ctx, l3_offset);
		break;
	case ETH_P_ARP:
		cpu_idx = 0; /* ARP packet handled on separate CPU */
		break;
	default:
		cpu_idx = 0;
	}

	/* Choose CPU based on L4 protocol */
	switch (ip_proto) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		cpu_idx = 2;
		break;
	case IPPROTO_TCP:
		cpu_idx = 0;
		break;
	case IPPROTO_UDP:
		cpu_idx = 1;
		break;
	default:
		cpu_idx = 0;
	}

	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

SEC("xdp_cpu_map4_ddos_filter_pktgen")
int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	u8 ip_proto = IPPROTO_UDP;
	struct datarec *rec;
	u16 eth_proto = 0;
	u64 l3_offset = 0;
	u32 cpu_dest = 0;
	u32 cpu_idx = 0;
	u16 dest_port;
	u32 *cpu_lookup;
	u32 key = 0;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
		return XDP_PASS; /* Just skip */

	/* Extract L4 protocol */
	switch (eth_proto) {
	case ETH_P_IP:
		ip_proto = get_proto_ipv4(ctx, l3_offset);
		break;
	case ETH_P_IPV6:
		ip_proto = get_proto_ipv6(ctx, l3_offset);
		break;
	case ETH_P_ARP:
		cpu_idx = 0; /* ARP packet handled on separate CPU */
		break;
	default:
		cpu_idx = 0;
	}

	/* Choose CPU based on L4 protocol */
	switch (ip_proto) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		cpu_idx = 2;
		break;
	case IPPROTO_TCP:
		cpu_idx = 0;
		break;
	case IPPROTO_UDP:
		cpu_idx = 1;
		/* DDoS filter UDP port 9 (pktgen) */
		dest_port = get_dest_port_ipv4_udp(ctx, l3_offset);
		if (dest_port == 9) {
			if (rec)
				rec->dropped++;
			return XDP_DROP;
		}
		break;
	default:
		cpu_idx = 0;
	}

	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
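
/* All five XDP programs above are alternative entry points; the companion
 * loader (xdp_redirect_cpu_user.c) picks one of them at attach time and also
 * fills in cpu_map and cpus_available.  Purely as an illustration of the
 * intended workflow (the option names below are guesses, not taken from
 * _user.c):
 *
 *	# select program variant 2 (round-robin), redirect to CPUs 0-3
 *	sudo ./xdp_redirect_cpu --dev eth0 --prognum 2 \
 *		--cpu 0 --cpu 1 --cpu 2 --cpu 3
 */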

char _license[] SEC("license") = "GPL";

/*** Trace point code ***/

/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
 * Code in: kernel/include/trace/events/xdp.h
 */
struct xdp_redirect_ctx {
	u64 __pad;		// First 8 bytes are not accessible by bpf code
	int prog_id;		//	offset:8;  size:4; signed:1;
	u32 act;		//	offset:12; size:4; signed:0;
	int ifindex;		//	offset:16; size:4; signed:1;
	int err;		//	offset:20; size:4; signed:1;
	int to_ifindex;		//	offset:24; size:4; signed:1;
	u32 map_id;		//	offset:28; size:4; signed:0;
	int map_index;		//	offset:32; size:4; signed:1;
};				// offset:36

enum {
	XDP_REDIRECT_SUCCESS = 0,
	XDP_REDIRECT_ERROR = 1
};

static __always_inline
int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
{
	u32 key = XDP_REDIRECT_ERROR;
	struct datarec *rec;
	int err = ctx->err;

	if (!err)
		key = XDP_REDIRECT_SUCCESS;

	rec = bpf_map_lookup_elem(&redirect_err_cnt, &key);
	if (!rec)
		return 0;
	rec->dropped += 1;

	return 0; /* Indicate event was filtered (no further processing) */
	/*
	 * Returning 1 here would allow e.g. a perf-record tracepoint
	 * to see and record these events, but it doesn't work well
	 * in practice, as stopping perf-record also unloads this
	 * bpf_prog.  Plus, there is additional overhead of doing so.
	 */
}
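/* These tracepoint programs are not attached as XDP; the sample loader is
 * expected to attach them through the perf tracepoint machinery (roughly:
 * perf_event_open() with PERF_TYPE_TRACEPOINT, then PERF_EVENT_IOC_SET_BPF).
 * In samples/bpf this is normally driven by bpf_load.c keying off the
 * "tracepoint/..." section names; the exact mechanism lives in the loader,
 * not in this file.
 */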
SEC("tracepoint/xdp/xdp_redirect_err")
int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx)
{
	return xdp_redirect_collect_stat(ctx);
}

SEC("tracepoint/xdp/xdp_redirect_map_err")
int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx)
{
	return xdp_redirect_collect_stat(ctx);
}

/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
 * Code in: kernel/include/trace/events/xdp.h
 */
struct xdp_exception_ctx {
	u64 __pad;	// First 8 bytes are not accessible by bpf code
	int prog_id;	//	offset:8;  size:4; signed:1;
	u32 act;	//	offset:12; size:4; signed:0;
	int ifindex;	//	offset:16; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_exception")
int trace_xdp_exception(struct xdp_exception_ctx *ctx)
{
	struct datarec *rec;
	u32 key = 0;

	rec = bpf_map_lookup_elem(&exception_cnt, &key);
	if (!rec)
		return 1;
	rec->dropped += 1;

	return 0;
}

/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
 * Code in: kernel/include/trace/events/xdp.h
 */
struct cpumap_enqueue_ctx {
	u64 __pad;		// First 8 bytes are not accessible by bpf code
	int map_id;		//	offset:8;  size:4; signed:1;
	u32 act;		//	offset:12; size:4; signed:0;
	int cpu;		//	offset:16; size:4; signed:1;
	unsigned int drops;	//	offset:20; size:4; signed:0;
	unsigned int processed;	//	offset:24; size:4; signed:0;
	int to_cpu;		//	offset:28; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_cpumap_enqueue")
int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
{
	u32 to_cpu = ctx->to_cpu;
	struct datarec *rec;

	if (to_cpu >= MAX_CPUS)
		return 1;

	rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu);
	if (!rec)
		return 0;
	rec->processed += ctx->processed;
	rec->dropped += ctx->drops;

	/* Record bulk events, then userspace can calc average bulk size */
	if (ctx->processed > 0)
		rec->issue += 1;

	/* Inception: It's possible to detect overload situations, via
	 * this tracepoint.  This can be used for creating a feedback
	 * loop to XDP, which can take appropriate actions to mitigate
	 * this overload situation.
	 */
	return 0;
}

/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
 * Code in: kernel/include/trace/events/xdp.h
 */
struct cpumap_kthread_ctx {
	u64 __pad;		// First 8 bytes are not accessible by bpf code
	int map_id;		//	offset:8;  size:4; signed:1;
	u32 act;		//	offset:12; size:4; signed:0;
	int cpu;		//	offset:16; size:4; signed:1;
	unsigned int drops;	//	offset:20; size:4; signed:0;
	unsigned int processed;	//	offset:24; size:4; signed:0;
	int sched;		//	offset:28; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_cpumap_kthread")
int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
{
	struct datarec *rec;
	u32 key = 0;

	rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key);
	if (!rec)
		return 0;
	rec->processed += ctx->processed;
	rec->dropped += ctx->drops;

	/* Count times kthread yielded CPU via schedule call */
	if (ctx->sched)
		rec->issue++;

	return 0;
}