/* XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP)
 *
 * GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
 */
#include <uapi/linux/if_ether.h>
#include <uapi/linux/if_packet.h>
#include <uapi/linux/if_vlan.h>
#include <uapi/linux/ip.h>
#include <uapi/linux/ipv6.h>
#include <uapi/linux/in.h>
#include <uapi/linux/tcp.h>
#include <uapi/linux/udp.h>

#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

#define MAX_CPUS 12 /* WARNING - sync with _user.c */

/* Special map type that can XDP_REDIRECT frames to another CPU */
struct bpf_map_def SEC("maps") cpu_map = {
	.type		= BPF_MAP_TYPE_CPUMAP,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= MAX_CPUS,
};
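/* Note: the u32 value stored in each cpu_map entry is the queue size
 * for that CPU's cpumap kthread; it is configured from the _user.c side.
 */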

/* Common stats data record to keep userspace simpler */
struct datarec {
	__u64 processed;
	__u64 dropped;
	__u64 issue;
};

/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
 * feedback. Redirect TX errors can be caught via a tracepoint.
 */
struct bpf_map_def SEC("maps") rx_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= 1,
};

/* Used by trace point */
struct bpf_map_def SEC("maps") redirect_err_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= 2,
	/* TODO: have entries for all possible errno's */
};

/* Used by trace point */
struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= MAX_CPUS,
};

/* Used by trace point */
struct bpf_map_def SEC("maps") cpumap_kthread_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= 1,
};

/* Set of maps controlling available CPUs, and for iterating through
 * selectable redirect CPUs.
 */
struct bpf_map_def SEC("maps") cpus_available = {
	.type		= BPF_MAP_TYPE_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= MAX_CPUS,
};
struct bpf_map_def SEC("maps") cpus_count = {
	.type		= BPF_MAP_TYPE_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= 1,
};
struct bpf_map_def SEC("maps") cpus_iterator = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= 1,
};
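/* Note: cpus_available maps an index to an actual CPU id, cpus_count
 * holds how many entries have been populated from userspace, and the
 * per-CPU cpus_iterator keeps the current round-robin position.
 */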

/* Used by trace point */
struct bpf_map_def SEC("maps") exception_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= 1,
};

/* Helper parse functions */

/* Parse Ethernet layer 2, extract network layer 3 offset and protocol
 *
 * Returns false on error or unsupported ether-type
 */
struct vlan_hdr {
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

static __always_inline
bool parse_eth(struct ethhdr *eth, void *data_end,
	       u16 *eth_proto, u64 *l3_offset)
{
	u16 eth_type;
	u64 offset;

	offset = sizeof(*eth);
	if ((void *)eth + offset > data_end)
		return false;

	eth_type = eth->h_proto;

	/* Skip non 802.3 Ethertypes */
	if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN))
		return false;

	/* Handle VLAN tagged packet */
	if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
		struct vlan_hdr *vlan_hdr;

		vlan_hdr = (void *)eth + offset;
		offset += sizeof(*vlan_hdr);
		if ((void *)eth + offset > data_end)
			return false;
		eth_type = vlan_hdr->h_vlan_encapsulated_proto;
	}
	/* TODO: Handle double VLAN tagged packet */

	*eth_proto = ntohs(eth_type);
	*l3_offset = offset;
	return true;
}
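/* Note: *eth_proto is returned in host byte order, so callers can
 * compare it directly against the ETH_P_* constants.
 */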

static __always_inline
u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct iphdr *iph = data + nh_off;
	struct udphdr *udph;
	u16 dport;

	if (iph + 1 > data_end)
		return 0;
	if (!(iph->protocol == IPPROTO_UDP))
		return 0;

	udph = (void *)(iph + 1);
	if (udph + 1 > data_end)
		return 0;

	dport = ntohs(udph->dest);
	return dport;
}
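/* Note: the UDP header is assumed to follow a fixed-size (20 byte)
 * IPv4 header; IP options are not parsed here.
 */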

static __always_inline
int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct iphdr *iph = data + nh_off;

	if (iph + 1 > data_end)
		return 0;
	return iph->protocol;
}

static __always_inline
int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ipv6hdr *ip6h = data + nh_off;

	if (ip6h + 1 > data_end)
		return 0;
	return ip6h->nexthdr;
}

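/* prognum0 deliberately never reads packet data; it measures the
 * baseline cost of redirecting frames to a remote CPU via the cpumap.
 */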
SEC("xdp_cpu_map0")
int xdp_prognum0_no_touch(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct datarec *rec;
	u32 *cpu_selected;
	u32 cpu_dest;
	u32 key = 0;

	/* Only use first entry in cpus_available */
	cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

SEC("xdp_cpu_map1_touch_data")
int xdp_prognum1_touch_data(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	struct datarec *rec;
	u32 *cpu_selected;
	u32 cpu_dest;
	u16 eth_type;
	u32 key = 0;

	/* Only use first entry in cpus_available */
	cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Validate packet length is at least the minimum Eth header size */
	if (eth + 1 > data_end)
		return XDP_ABORTED;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	/* Read packet data, and use it (drop non 802.3 Ethertypes) */
	eth_type = eth->h_proto;
	if (ntohs(eth_type) < ETH_P_802_3_MIN) {
		rec->dropped++;
		return XDP_DROP;
	}

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

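/* Round-robin note: cpus_iterator is a per-CPU array, so each RX CPU
 * keeps its own position; the rotation below is per RX CPU, not global.
 */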
SEC("xdp_cpu_map2_round_robin")
int xdp_prognum2_round_robin(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	struct datarec *rec;
	u32 cpu_dest;
	u32 *cpu_lookup;
	u32 key0 = 0;

	u32 *cpu_selected;
	u32 *cpu_iterator;
	u32 *cpu_max;
	u32 cpu_idx;

	cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
	if (!cpu_max)
		return XDP_ABORTED;

	cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0);
	if (!cpu_iterator)
		return XDP_ABORTED;
	cpu_idx = *cpu_iterator;

	*cpu_iterator += 1;
	if (*cpu_iterator == *cpu_max)
		*cpu_iterator = 0;

	cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key0);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

SEC("xdp_cpu_map3_proto_separate")
int xdp_prognum3_proto_separate(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	u8 ip_proto = IPPROTO_UDP;
	struct datarec *rec;
	u16 eth_proto = 0;
	u64 l3_offset = 0;
	u32 cpu_dest = 0;
	u32 cpu_idx = 0;
	u32 *cpu_lookup;
	u32 key = 0;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
		return XDP_PASS; /* Just skip */

	/* Extract L4 protocol */
	switch (eth_proto) {
	case ETH_P_IP:
		ip_proto = get_proto_ipv4(ctx, l3_offset);
		break;
	case ETH_P_IPV6:
		ip_proto = get_proto_ipv6(ctx, l3_offset);
		break;
	case ETH_P_ARP:
		cpu_idx = 0; /* ARP packet handled on separate CPU */
		break;
	default:
		cpu_idx = 0;
	}

	/* Choose CPU based on L4 protocol */
	switch (ip_proto) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		cpu_idx = 2;
		break;
	case IPPROTO_TCP:
		cpu_idx = 0;
		break;
	case IPPROTO_UDP:
		cpu_idx = 1;
		break;
	default:
		cpu_idx = 0;
	}

	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

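/* Like prognum3, but additionally drops UDP packets to destination
 * port 9 (pktgen traffic).  The port lookup only parses IPv4 headers
 * (get_dest_port_ipv4_udp), so IPv6 UDP is not filtered.
 */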
SEC("xdp_cpu_map4_ddos_filter_pktgen")
int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	u8 ip_proto = IPPROTO_UDP;
	struct datarec *rec;
	u16 eth_proto = 0;
	u64 l3_offset = 0;
	u32 cpu_dest = 0;
	u32 cpu_idx = 0;
	u16 dest_port;
	u32 *cpu_lookup;
	u32 key = 0;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
		return XDP_PASS; /* Just skip */

	/* Extract L4 protocol */
	switch (eth_proto) {
	case ETH_P_IP:
		ip_proto = get_proto_ipv4(ctx, l3_offset);
		break;
	case ETH_P_IPV6:
		ip_proto = get_proto_ipv6(ctx, l3_offset);
		break;
	case ETH_P_ARP:
		cpu_idx = 0; /* ARP packet handled on separate CPU */
		break;
	default:
		cpu_idx = 0;
	}

	/* Choose CPU based on L4 protocol */
	switch (ip_proto) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		cpu_idx = 2;
		break;
	case IPPROTO_TCP:
		cpu_idx = 0;
		break;
	case IPPROTO_UDP:
		cpu_idx = 1;
		/* DDoS filter UDP port 9 (pktgen) */
		dest_port = get_dest_port_ipv4_udp(ctx, l3_offset);
		if (dest_port == 9) {
			if (rec)
				rec->dropped++;
			return XDP_DROP;
		}
		break;
	default:
		cpu_idx = 0;
	}

	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

char _license[] SEC("license") = "GPL";

/*** Trace point code ***/

/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
 * Code in: kernel/include/trace/events/xdp.h
 */
struct xdp_redirect_ctx {
	u64 __pad;		// First 8 bytes are not accessible by bpf code
	int prog_id;		//	offset:8;  size:4; signed:1;
	u32 act;		//	offset:12  size:4; signed:0;
	int ifindex;		//	offset:16  size:4; signed:1;
	int err;		//	offset:20  size:4; signed:1;
	int to_ifindex;		//	offset:24  size:4; signed:1;
	u32 map_id;		//	offset:28  size:4; signed:0;
	int map_index;		//	offset:32  size:4; signed:1;
};				//	offset:36
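/* Note: the field offsets above must mirror the tracepoint's format
 * file; the bpf program reads the raw trace record through this layout.
 */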

enum {
	XDP_REDIRECT_SUCCESS = 0,
	XDP_REDIRECT_ERROR = 1
};

static __always_inline
int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
{
	u32 key = XDP_REDIRECT_ERROR;
	struct datarec *rec;
	int err = ctx->err;

	if (!err)
		key = XDP_REDIRECT_SUCCESS;

	rec = bpf_map_lookup_elem(&redirect_err_cnt, &key);
	if (!rec)
		return 0;
	rec->dropped += 1;

	return 0; /* Indicate event was filtered (no further processing) */
	/*
	 * Returning 1 here would allow e.g. a perf-record tracepoint
	 * to see and record these events, but it doesn't work well
	 * in practice, as stopping perf-record also unloads this
	 * bpf_prog.  Plus, there is additional overhead of doing so.
	 */
}

SEC("tracepoint/xdp/xdp_redirect_err")
int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx)
{
	return xdp_redirect_collect_stat(ctx);
}

SEC("tracepoint/xdp/xdp_redirect_map_err")
int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx)
{
	return xdp_redirect_collect_stat(ctx);
}

/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
 * Code in: kernel/include/trace/events/xdp.h
 */
struct xdp_exception_ctx {
	u64 __pad;		// First 8 bytes are not accessible by bpf code
	int prog_id;		//	offset:8;  size:4; signed:1;
	u32 act;		//	offset:12; size:4; signed:0;
	int ifindex;		//	offset:16; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_exception")
int trace_xdp_exception(struct xdp_exception_ctx *ctx)
{
	struct datarec *rec;
	u32 key = 0;

	rec = bpf_map_lookup_elem(&exception_cnt, &key);
	if (!rec)
		return 1;
	rec->dropped += 1;

	return 0;
}

/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
 * Code in: kernel/include/trace/events/xdp.h
 */
struct cpumap_enqueue_ctx {
	u64 __pad;		// First 8 bytes are not accessible by bpf code
	int map_id;		//	offset:8;  size:4; signed:1;
	u32 act;		//	offset:12; size:4; signed:0;
	int cpu;		//	offset:16; size:4; signed:1;
	unsigned int drops;	//	offset:20; size:4; signed:0;
	unsigned int processed;	//	offset:24; size:4; signed:0;
	int to_cpu;		//	offset:28; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_cpumap_enqueue")
int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
{
	u32 to_cpu = ctx->to_cpu;
	struct datarec *rec;

	if (to_cpu >= MAX_CPUS)
		return 1;

	rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu);
	if (!rec)
		return 0;
	rec->processed += ctx->processed;
	rec->dropped   += ctx->drops;

	/* Record bulk events, then userspace can calc average bulk size */
	if (ctx->processed > 0)
		rec->issue += 1;

	/* Inception: It's possible to detect overload situations via this
	 * tracepoint.  This can be used for creating a feedback loop to
	 * XDP, which can take appropriate actions to mitigate the
	 * overload situation.
	 */
	return 0;
}

/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
 * Code in: kernel/include/trace/events/xdp.h
 */
struct cpumap_kthread_ctx {
	u64 __pad;		// First 8 bytes are not accessible by bpf code
	int map_id;		//	offset:8;  size:4; signed:1;
	u32 act;		//	offset:12; size:4; signed:0;
	int cpu;		//	offset:16; size:4; signed:1;
	unsigned int drops;	//	offset:20; size:4; signed:0;
	unsigned int processed;	//	offset:24; size:4; signed:0;
	int sched;		//	offset:28; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_cpumap_kthread")
int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
{
	struct datarec *rec;
	u32 key = 0;

	rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key);
	if (!rec)
		return 0;
	rec->processed += ctx->processed;
	rec->dropped   += ctx->drops;

	/* Count times kthread yielded CPU via schedule call */
	if (ctx->sched)
		rec->issue++;

	return 0;
}