/* XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP)
 *
 * GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
 */

#include <uapi/linux/if_ether.h>
#include <uapi/linux/if_packet.h>
#include <uapi/linux/if_vlan.h>
#include <uapi/linux/ip.h>
#include <uapi/linux/ipv6.h>
#include <uapi/linux/in.h>
#include <uapi/linux/tcp.h>
#include <uapi/linux/udp.h>

#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

#define MAX_CPUS 12 /* WARNING - sync with _user.c */

/* Special map type that can XDP_REDIRECT frames to another CPU */
struct bpf_map_def SEC("maps") cpu_map = {
	.type		= BPF_MAP_TYPE_CPUMAP,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= MAX_CPUS,
};

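/* Note: in a cpumap the key is the destination CPU number, and the u32 value
 * is the queue size for that CPU's kernel kthread (configured from user
 * space).  bpf_redirect_map(&cpu_map, cpu, 0) enqueues the raw XDP frame to
 * the remote CPU, where the kthread allocates the SKB and continues normal
 * network stack processing.
 */
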
/* Common stats data record to keep userspace more simple */
struct datarec {
	__u64 processed;
	__u64 dropped;
	__u64 issue;
};

/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
 * feedback.  Redirect TX errors can be caught via a tracepoint.
 */
struct bpf_map_def SEC("maps") rx_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= 1,
};

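/* A minimal user-space sketch (not the actual _user.c) of how one of these
 * BPF_MAP_TYPE_PERCPU_ARRAY stats maps can be read: bpf_map_lookup_elem()
 * on a per-CPU map fills one struct datarec per possible CPU, which the
 * reader then sums.  The map_fd and nr_cpus variables are assumed to come
 * from the loader.
 *
 *	struct datarec values[nr_cpus];
 *	__u64 sum = 0;
 *	__u32 key = 0;
 *	int i;
 *
 *	if (bpf_map_lookup_elem(map_fd, &key, values) == 0)
 *		for (i = 0; i < nr_cpus; i++)
 *			sum += values[i].processed;
 */
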
/* Used by trace point */
struct bpf_map_def SEC("maps") redirect_err_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= 2,
	/* TODO: have entries for all possible errno's */
};

/* Used by trace point */
struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= MAX_CPUS,
};

/* Used by trace point */
struct bpf_map_def SEC("maps") cpumap_kthread_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= 1,
};

/* Set of maps controlling available CPU, and for iterating through
 * selectable redirect CPUs.
 */
struct bpf_map_def SEC("maps") cpus_available = {
	.type		= BPF_MAP_TYPE_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= MAX_CPUS,
};

struct bpf_map_def SEC("maps") cpus_count = {
	.type		= BPF_MAP_TYPE_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= 1,
};

struct bpf_map_def SEC("maps") cpus_iterator = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= 1,
};

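/* How the three maps above cooperate: user space fills cpus_available[i]
 * with the actual CPU numbers eligible for redirect, stores how many of
 * those slots are valid in cpus_count (key 0), and the round-robin program
 * below keeps its current position in cpus_iterator.
 */
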
/* Used by trace point */
struct bpf_map_def SEC("maps") exception_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= 1,
};

/* Helper parse functions */

/* Parse Ethernet layer 2, extract network layer 3 offset and protocol
 *
 * Returns false on error and non-supported ether-type
 */
struct vlan_hdr {
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

static __always_inline
bool parse_eth(struct ethhdr *eth, void *data_end,
	       u16 *eth_proto, u64 *l3_offset)
{
	u16 eth_type;
	u64 offset;

	offset = sizeof(*eth);
	if ((void *)eth + offset > data_end)
		return false;

	eth_type = eth->h_proto;

	/* Skip non 802.3 Ethertypes */
	if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN))
		return false;

	/* Handle VLAN tagged packet */
	if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
		struct vlan_hdr *vlan_hdr;

		vlan_hdr = (void *)eth + offset;
		offset += sizeof(*vlan_hdr);
		if ((void *)eth + offset > data_end)
			return false;
		eth_type = vlan_hdr->h_vlan_encapsulated_proto;
	}
	/* TODO: Handle double VLAN tagged packet */

	*eth_proto = ntohs(eth_type);
	*l3_offset = offset;
	return true;
}

static __always_inline
u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct iphdr *iph = data + nh_off;
	struct udphdr *udph;
	u16 dport;

	if (iph + 1 > data_end)
		return 0;
	if (!(iph->protocol == IPPROTO_UDP))
		return 0;

	udph = (void *)(iph + 1);
	if (udph + 1 > data_end)
		return 0;

	dport = ntohs(udph->dest);
	return dport;
}

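/* Note: the helper above locates the UDP header directly after a fixed-size
 * struct iphdr (iph + 1), i.e. it assumes an IPv4 header without options;
 * packets carrying IP options will not have their destination port read
 * correctly, which this sample tolerates for its pktgen-style traffic.
 */
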
static __always_inline
int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct iphdr *iph = data + nh_off;

	if (iph + 1 > data_end)
		return 0;
	return iph->protocol;
}

static __always_inline
int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ipv6hdr *ip6h = data + nh_off;

	if (ip6h + 1 > data_end)
		return 0;
	return ip6h->nexthdr;
}

SEC("xdp_cpu_map0")
int xdp_prognum0_no_touch(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct datarec *rec;
	u32 *cpu_selected;
	u32 cpu_dest;
	u32 key = 0;

	/* Only use first entry in cpus_available */
	cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

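/* The program above (xdp_prognum0_no_touch) redirects without ever reading
 * packet data, leaving the payload cache-cold on the RX CPU; the program
 * below (xdp_prognum1_touch_data) additionally reads the Ethernet header
 * first, so comparing the two shows the cost of touching packet data before
 * the redirect.
 */
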
220 SEC("xdp_cpu_map1_touch_data")
221 int xdp_prognum1_touch_data(struct xdp_md
*ctx
)
223 void *data_end
= (void *)(long)ctx
->data_end
;
224 void *data
= (void *)(long)ctx
->data
;
225 struct ethhdr
*eth
= data
;
232 /* Only use first entry in cpus_available */
233 cpu_selected
= bpf_map_lookup_elem(&cpus_available
, &key
);
236 cpu_dest
= *cpu_selected
;
238 /* Validate packet length is minimum Eth header size */
239 if (eth
+ 1 > data_end
)
242 /* Count RX packet in map */
243 rec
= bpf_map_lookup_elem(&rx_cnt
, &key
);
248 /* Read packet data, and use it (drop non 802.3 Ethertypes) */
249 eth_type
= eth
->h_proto
;
250 if (ntohs(eth_type
) < ETH_P_802_3_MIN
) {
255 if (cpu_dest
>= MAX_CPUS
) {
260 return bpf_redirect_map(&cpu_map
, cpu_dest
, 0);
SEC("xdp_cpu_map2_round_robin")
int xdp_prognum2_round_robin(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	struct datarec *rec;
	u32 cpu_dest;
	u32 key0 = 0;

	u32 *cpu_selected;
	u32 *cpu_iterator;
	u32 *cpu_max;
	u32 cpu_idx;

	cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
	if (!cpu_max)
		return XDP_ABORTED;

	cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0);
	if (!cpu_iterator)
		return XDP_ABORTED;
	cpu_idx = *cpu_iterator;

	*cpu_iterator += 1;
	if (*cpu_iterator == *cpu_max)
		*cpu_iterator = 0;

	cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key0);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

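/* Because cpus_iterator is a BPF_MAP_TYPE_PERCPU_ARRAY, each RX CPU keeps
 * its own round-robin position; the rotation is per-RX-CPU rather than
 * globally synchronized, which avoids cross-CPU contention on the iterator.
 */
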
SEC("xdp_cpu_map3_proto_separate")
int xdp_prognum3_proto_separate(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	u8 ip_proto = IPPROTO_UDP;
	struct datarec *rec;
	u16 eth_proto = 0;
	u64 l3_offset = 0;
	u32 cpu_dest = 0;
	u32 cpu_idx = 0;
	u32 *cpu_lookup;
	u32 key = 0;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
		return XDP_PASS; /* Just skip */

	/* Extract L4 protocol */
	switch (eth_proto) {
	case ETH_P_IP:
		ip_proto = get_proto_ipv4(ctx, l3_offset);
		break;
	case ETH_P_IPV6:
		ip_proto = get_proto_ipv6(ctx, l3_offset);
		break;
	case ETH_P_ARP:
		cpu_idx = 0; /* ARP packet handled on separate CPU */
		break;
	default:
		cpu_idx = 0;
	}

	/* Choose CPU based on L4 protocol */
	switch (ip_proto) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		cpu_idx = 2;
		break;
	case IPPROTO_TCP:
		cpu_idx = 0;
		break;
	case IPPROTO_UDP:
		cpu_idx = 1;
		break;
	default:
		cpu_idx = 0;
	}

	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

SEC("xdp_cpu_map4_ddos_filter_pktgen")
int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	u8 ip_proto = IPPROTO_UDP;
	struct datarec *rec;
	u16 eth_proto = 0;
	u64 l3_offset = 0;
	u32 cpu_dest = 0;
	u32 cpu_idx = 0;
	u16 dest_port;
	u32 *cpu_lookup;
	u32 key = 0;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
		return XDP_PASS; /* Just skip */

	/* Extract L4 protocol */
	switch (eth_proto) {
	case ETH_P_IP:
		ip_proto = get_proto_ipv4(ctx, l3_offset);
		break;
	case ETH_P_IPV6:
		ip_proto = get_proto_ipv6(ctx, l3_offset);
		break;
	case ETH_P_ARP:
		cpu_idx = 0; /* ARP packet handled on separate CPU */
		break;
	default:
		cpu_idx = 0;
	}

	/* Choose CPU based on L4 protocol */
	switch (ip_proto) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		cpu_idx = 2;
		break;
	case IPPROTO_TCP:
		cpu_idx = 0;
		break;
	case IPPROTO_UDP:
		cpu_idx = 1;
		/* DDoS filter UDP port 9 (pktgen) */
		dest_port = get_dest_port_ipv4_udp(ctx, l3_offset);
		if (dest_port == 9) {
			rec->dropped++;
			return XDP_DROP;
		}
		break;
	default:
		cpu_idx = 0;
	}

	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

char _license[] SEC("license") = "GPL";

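/* A rough user-space setup sketch (illustrative only; the real code lives in
 * the matching _user.c).  For cpu_map the value written per CPU is the queue
 * size of that CPU's kthread; cpus_available and cpus_count then select which
 * CPUs the XDP programs may pick.  map_fd() is an assumed helper returning
 * the fd of a map by name.
 *
 *	__u32 cpu = 2, avail_idx = 0, count = 1, qsize = 192, key0 = 0;
 *
 *	bpf_map_update_elem(map_fd("cpu_map"), &cpu, &qsize, 0);
 *	bpf_map_update_elem(map_fd("cpus_available"), &avail_idx, &cpu, 0);
 *	bpf_map_update_elem(map_fd("cpus_count"), &key0, &count, 0);
 */
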
/*** Trace point code ***/

/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
 * Code in:                kernel/include/trace/events/xdp.h
 */
struct xdp_redirect_ctx {
	u64 __pad;	// First 8 bytes are not accessible by bpf code
	int prog_id;	//	offset:8;  size:4; signed:1;
	u32 act;	//	offset:12; size:4; signed:0;
	int ifindex;	//	offset:16; size:4; signed:1;
	int err;	//	offset:20; size:4; signed:1;
	int to_ifindex;	//	offset:24; size:4; signed:1;
	u32 map_id;	//	offset:28; size:4; signed:0;
	int map_index;	//	offset:32; size:4; signed:1;
};

enum {
	XDP_REDIRECT_SUCCESS = 0,
	XDP_REDIRECT_ERROR = 1
};

static __always_inline
int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
{
	u32 key = XDP_REDIRECT_ERROR;
	struct datarec *rec;
	int err = ctx->err;

	if (!err)
		key = XDP_REDIRECT_SUCCESS;

	rec = bpf_map_lookup_elem(&redirect_err_cnt, &key);
	if (!rec)
		return 0;
	rec->dropped += 1;

	return 0; /* Indicate event was filtered (no further processing) */
	/*
	 * Returning 1 here would allow e.g. a perf-record tracepoint
	 * to see and record these events, but it doesn't work well
	 * in practice, as stopping perf-record also unloads this
	 * bpf_prog.  Plus, there is additional overhead of doing so.
	 */
}

SEC("tracepoint/xdp/xdp_redirect_err")
int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx)
{
	return xdp_redirect_collect_stat(ctx);
}

SEC("tracepoint/xdp/xdp_redirect_map_err")
int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx)
{
	return xdp_redirect_collect_stat(ctx);
}

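/* Both the xdp_redirect_err and xdp_redirect_map_err tracepoints share the
 * field layout described by struct xdp_redirect_ctx above, so the single
 * collector xdp_redirect_collect_stat() serves both probes.
 */
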
/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
 * Code in:                kernel/include/trace/events/xdp.h
 */
struct xdp_exception_ctx {
	u64 __pad;	// First 8 bytes are not accessible by bpf code
	int prog_id;	//	offset:8;  size:4; signed:1;
	u32 act;	//	offset:12; size:4; signed:0;
	int ifindex;	//	offset:16; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_exception")
int trace_xdp_exception(struct xdp_exception_ctx *ctx)
{
	struct datarec *rec;
	u32 key = 0;

	rec = bpf_map_lookup_elem(&exception_cnt, &key);
	if (!rec)
		return 1;
	rec->dropped += 1;

	return 0;
}

/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
 * Code in:         kernel/include/trace/events/xdp.h
 */
struct cpumap_enqueue_ctx {
	u64 __pad;		// First 8 bytes are not accessible by bpf code
	int map_id;		//	offset:8;  size:4; signed:1;
	u32 act;		//	offset:12; size:4; signed:0;
	int cpu;		//	offset:16; size:4; signed:1;
	unsigned int drops;	//	offset:20; size:4; signed:0;
	unsigned int processed;	//	offset:24; size:4; signed:0;
	int to_cpu;		//	offset:28; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_cpumap_enqueue")
int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
{
	u32 to_cpu = ctx->to_cpu;
	struct datarec *rec;

	if (to_cpu >= MAX_CPUS)
		return 1;

	rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu);
	if (!rec)
		return 0;
	rec->processed += ctx->processed;
	rec->dropped   += ctx->drops;

	/* Record bulk events, then userspace can calc average bulk size */
	if (ctx->processed > 0)
		rec->issue += 1;

	/* Inception: It's possible to detect overload situations, via
	 * this tracepoint.  This can be used for creating a feedback
	 * loop to XDP, which can take appropriate actions to mitigate
	 * this overload situation.
	 */
	return 0;
}

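/* From the counters above, user space can derive the average enqueue bulk
 * size for a CPU as processed / issue (frames per non-empty enqueue event),
 * a useful indicator of how well cpumap batching behaves under load.
 */
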
/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
 * Code in:         kernel/include/trace/events/xdp.h
 */
struct cpumap_kthread_ctx {
	u64 __pad;		// First 8 bytes are not accessible by bpf code
	int map_id;		//	offset:8;  size:4; signed:1;
	u32 act;		//	offset:12; size:4; signed:0;
	int cpu;		//	offset:16; size:4; signed:1;
	unsigned int drops;	//	offset:20; size:4; signed:0;
	unsigned int processed;	//	offset:24; size:4; signed:0;
	int sched;		//	offset:28; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_cpumap_kthread")
int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
{
	struct datarec *rec;
	u32 key = 0;

	rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key);
	if (!rec)
		return 0;
	rec->processed += ctx->processed;
	rec->dropped   += ctx->drops;

	/* Count times kthread yielded CPU via schedule call */
	if (ctx->sched)
		rec->issue++;

	return 0;
}