4 * Developed by Daynix Computing LTD (http://www.daynix.com)
7 * Andrew Melnychenko <andrew@daynix.com>
8 * Yuri Benditovich <yuri.benditovich@daynix.com>
10 * This work is licensed under the terms of the GNU GPL, version 2. See
11 * the COPYING file in the top-level directory.
14 * Requires llvm, clang, bpftool, linux kernel tree
16 * Build rss.bpf.skeleton.h:
17 * make -f Makefile.ebpf clean all
22 #include <linux/bpf.h>
25 #include <linux/if_ether.h>
27 #include <linux/ipv6.h>
29 #include <linux/udp.h>
30 #include <linux/tcp.h>
32 #include <bpf/bpf_helpers.h>
33 #include <bpf/bpf_endian.h>
34 #include <linux/virtio_net.h>
/* Number of entries in the tap_rss_map_indirection_table array map. */
36 #define INDIRECTION_TABLE_SIZE 128
/*
 * Size in bytes of the RSS input buffer, of toeplitz_key_data_t.next_byte[],
 * and the fixed iteration count of the byte loop in net_toeplitz_add().
 */
37 #define HASH_CALCULATION_BUFFER_SIZE 36
43 __u16 indirections_len
;
45 } __attribute__((packed
));
/*
 * Toeplitz key material read by net_toeplitz_add(): a 32-bit key window
 * followed by the key bytes that are fed into that window.
 */
47 struct toeplitz_key_data_t
{
/* Initial value of the 32-bit key window. */
48 __u32 leftmost_32_bits
;
/* Key bytes following the window; indexed in step with the input bytes. */
49 __u8 next_byte
[HASH_CALCULATION_BUFFER_SIZE
];
/*
 * Per-packet data filled in by parse_packet()/parse_ipv6_ext() and read by
 * calculate_rss_hash(). Only the IPv6 address members are visible in this
 * excerpt; the flag, port and IPv4 address members that other functions
 * access are declared on lines not shown here.
 */
52 struct packet_hash_info_t
{
/* Source address copied from the IPv6 header (ip6.saddr). */
71 struct in6_addr in6_src
;
/* Destination address copied from the IPv6 header (ip6.daddr). */
72 struct in6_addr in6_dst
;
/* Address loaded from an IPV6_TLV_HAO destination option, when present. */
73 struct in6_addr in6_ext_src
;
/* Address loaded from a type 2 routing header, when present. */
74 struct in6_addr in6_ext_dst
;
/*
 * Array map with __u32 keys and struct rss_config_t values; looked up by
 * tun_rss_steering_prog() to obtain the steering configuration.
 * The entry count is set on a line not visible in this excerpt.
 */
79 struct bpf_map_def
SEC("maps")
80 tap_rss_map_configurations
= {
81 .type
= BPF_MAP_TYPE_ARRAY
,
82 .key_size
= sizeof(__u32
),
83 .value_size
= sizeof(struct rss_config_t
),
/*
 * Array map with __u32 keys and struct toeplitz_key_data_t values; looked up
 * by tun_rss_steering_prog() to obtain the Toeplitz key.
 * The entry count is set on a line not visible in this excerpt.
 */
87 struct bpf_map_def
SEC("maps")
88 tap_rss_map_toeplitz_key
= {
89 .type
= BPF_MAP_TYPE_ARRAY
,
90 .key_size
= sizeof(__u32
),
91 .value_size
= sizeof(struct toeplitz_key_data_t
),
/*
 * Array map with __u32 keys and __u16 values, INDIRECTION_TABLE_SIZE entries
 * long; tun_rss_steering_prog() looks it up to obtain the queue.
 */
95 struct bpf_map_def
SEC("maps")
96 tap_rss_map_indirection_table
= {
97 .type
= BPF_MAP_TYPE_ARRAY
,
98 .key_size
= sizeof(__u32
),
99 .value_size
= sizeof(__u16
),
100 .max_entries
= INDIRECTION_TABLE_SIZE
,
/*
 * Append one field to the RSS hash input buffer.
 *
 * rss_input:     destination buffer.
 * bytes_written: in/out running offset into rss_input; advanced by size.
 * ptr:           source bytes to copy.
 * size:          number of bytes to copy.
 *
 * No bounds check is done here: the caller must ensure that
 * *bytes_written + size stays within rss_input.
 */
103 static inline void net_rx_rss_add_chunk(__u8
*rss_input
, size_t *bytes_written
,
104 const void *ptr
, size_t size
) {
105 __builtin_memcpy(&rss_input
[*bytes_written
], ptr
, size
);
106 *bytes_written
+= size
;
/*
 * Fold the input bytes into *result using the Toeplitz key.
 *
 * result: in/out hash accumulator; read at entry, written back at the end.
 * key:    supplies the initial 32-bit window and the following key bytes.
 *
 * The outer loop always runs HASH_CALCULATION_BUFFER_SIZE times, so `input`
 * is indexed up to that size regardless of how many bytes the caller filled.
 * Part of the parameter list and the statements that advance input_byte and
 * key_byte are on lines not visible in this excerpt.
 */
110 void net_toeplitz_add(__u32
*result
,
113 , struct toeplitz_key_data_t
*key
) {
115 __u32 accumulator
= *result
;
116 __u32 leftmost_32_bits
= key
->leftmost_32_bits
;
119 for (byte
= 0; byte
< HASH_CALCULATION_BUFFER_SIZE
; byte
++) {
120 __u8 input_byte
= input
[byte
];
121 __u8 key_byte
= key
->next_byte
[byte
];
124 for (bit
= 0; bit
< 8; bit
++) {
/* A set top input bit XORs the current key window into the hash. */
125 if (input_byte
& (1 << 7)) {
126 accumulator
^= leftmost_32_bits
;
/* Slide the window left by one, pulling in the top bit of key_byte. */
130 (leftmost_32_bits
<< 1) | ((key_byte
& (1 << 7)) >> 7);
137 *result
= accumulator
;
/*
 * Classify an IPv6 next-header value; callers treat a non-zero result as
 * "this is an extension header to keep walking".
 * Some case labels and the return statements are on lines not visible in
 * this excerpt.
 */
141 static inline int ip6_extension_header_type(__u8 hdr_type
)
144 case IPPROTO_HOPOPTS
:
145 case IPPROTO_ROUTING
:
146 case IPPROTO_FRAGMENT
:
149 case IPPROTO_DSTOPTS
:
158 * https://www.iana.org/assignments/ipv6-parameters/ipv6-parameters.xhtml
159 * we expect that there will be no more than 11 extensions in an IPv6 header;
160 * there are also 27 TLV options for Destination and Hop-by-hop extensions.
161 * We need to choose a reasonable maximum number of extensions/options to
162 * check in order to find the ext src/dst.
/* Upper bound on extension headers walked by parse_ipv6_ext(). */
164 #define IP6_EXTENSIONS_COUNT 11
/* Upper bound on TLV options inspected inside one Destination Options header. */
165 #define IP6_OPTIONS_COUNT 30
/*
 * Walk the IPv6 extension header chain.
 *
 * skb:         packet; all loads are relative to BPF_HDR_START_NET.
 * info:        receives in6_ext_dst / in6_ext_src and the is_ipv6_ext_dst,
 *              is_ipv6_ext_src and is_fragmented flags.
 * l4_protocol: in/out; starts as the IPv6 next-header value and is replaced
 *              by each extension header's nexthdr.
 * l4_offset:   in/out; offset of the current header, advanced past each
 *              extension header.
 *
 * At most IP6_EXTENSIONS_COUNT headers are examined. Error checks after the
 * loads, the loop exits and the return statements are on lines not visible
 * in this excerpt.
 */
167 static inline int parse_ipv6_ext(struct __sk_buff
*skb
,
168 struct packet_hash_info_t
*info
,
169 __u8
*l4_protocol
, size_t *l4_offset
)
173 if (!ip6_extension_header_type(*l4_protocol
)) {
177 struct ipv6_opt_hdr ext_hdr
= {};
179 for (unsigned int i
= 0; i
< IP6_EXTENSIONS_COUNT
; ++i
) {
181 err
= bpf_skb_load_bytes_relative(skb
, *l4_offset
, &ext_hdr
,
182 sizeof(ext_hdr
), BPF_HDR_START_NET
);
187 if (*l4_protocol
== IPPROTO_ROUTING
) {
188 struct ipv6_rt_hdr ext_rt
= {};
190 err
= bpf_skb_load_bytes_relative(skb
, *l4_offset
, &ext_rt
,
191 sizeof(ext_rt
), BPF_HDR_START_NET
);
/*
 * Only a type 2 routing header carrying exactly one address
 * (hdrlen == 16 / 8) with one segment left supplies in6_ext_dst.
 */
196 if ((ext_rt
.type
== IPV6_SRCRT_TYPE_2
) &&
197 (ext_rt
.hdrlen
== sizeof(struct in6_addr
) / 8) &&
198 (ext_rt
.segments_left
== 1)) {
200 err
= bpf_skb_load_bytes_relative(skb
,
201 *l4_offset
+ offsetof(struct rt2_hdr
, addr
),
202 &info
->in6_ext_dst
, sizeof(info
->in6_ext_dst
),
208 info
->is_ipv6_ext_dst
= 1;
211 } else if (*l4_protocol
== IPPROTO_DSTOPTS
) {
215 } __attribute__((packed
)) opt
= {};
/* Options start right after the fixed part of the extension header. */
217 size_t opt_offset
= sizeof(ext_hdr
);
219 for (unsigned int j
= 0; j
< IP6_OPTIONS_COUNT
; ++j
) {
220 err
= bpf_skb_load_bytes_relative(skb
, *l4_offset
+ opt_offset
,
221 &opt
, sizeof(opt
), BPF_HDR_START_NET
);
226 if (opt
.type
== IPV6_TLV_HAO
) {
227 err
= bpf_skb_load_bytes_relative(skb
,
228 *l4_offset
+ opt_offset
229 + offsetof(struct ipv6_destopt_hao
, addr
),
230 &info
->in6_ext_src
, sizeof(info
->in6_ext_src
),
236 info
->is_ipv6_ext_src
= 1;
/* PAD1 is a single byte; every other option is header plus opt.length. */
240 opt_offset
+= (opt
.type
== IPV6_TLV_PAD1
) ?
241 1 : opt
.length
+ sizeof(opt
);
/*
 * NOTE(review): this bound uses ext_hdr.hdrlen * 8, while the header advance
 * further down uses (ext_hdr.hdrlen + 1) * 8, so the option walk stops 8
 * bytes before the end of the header -- confirm this is intended.
 */
243 if (opt_offset
+ 1 >= ext_hdr
.hdrlen
* 8) {
247 } else if (*l4_protocol
== IPPROTO_FRAGMENT
) {
248 info
->is_fragmented
= true;
/* Step to the next header: hdrlen counts 8-byte units beyond the first 8. */
251 *l4_protocol
= ext_hdr
.nexthdr
;
252 *l4_offset
+= (ext_hdr
.hdrlen
+ 1) * 8;
254 if (!ip6_extension_header_type(ext_hdr
.nexthdr
)) {
/*
 * Read the 16-bit protocol field of the frame and return it as __be16.
 *
 * The first load is at byte offset 12. The value is then examined in host
 * order and, for some cases, a second load is issued at `offset`.
 * The case labels, the offset adjustment, the base argument of the loads and
 * the error/return paths are on lines not visible in this excerpt.
 * parse_packet() treats a result of 0 as failure.
 */
264 static __be16
parse_eth_type(struct __sk_buff
*skb
)
266 unsigned int offset
= 12;
270 err
= bpf_skb_load_bytes_relative(skb
, offset
, &ret
, sizeof(ret
),
276 switch (bpf_ntohs(ret
)) {
281 err
= bpf_skb_load_bytes_relative(skb
, offset
, &ret
, sizeof(ret
),
/*
 * Extract the fields needed for RSS hashing from the packet into *info.
 *
 * skb:  packet to inspect.
 * info: output; receives the L3 addresses, the is_fragmented flag and, for
 *       unfragmented TCP/UDP, the source and destination ports.
 *
 * The L3 protocol comes from parse_eth_type(); 0 is treated as a failure.
 * Error handling after each load, the lines setting the is_ipv4/is_ipv6/
 * is_tcp/is_udp flags and the return statements are on lines not visible in
 * this excerpt.
 */
294 static inline int parse_packet(struct __sk_buff
*skb
,
295 struct packet_hash_info_t
*info
)
303 size_t l4_offset
= 0;
304 __u8 l4_protocol
= 0;
305 __u16 l3_protocol
= bpf_ntohs(parse_eth_type(skb
));
306 if (l3_protocol
== 0) {
311 if (l3_protocol
== ETH_P_IP
) {
314 struct iphdr ip
= {};
315 err
= bpf_skb_load_bytes_relative(skb
, 0, &ip
, sizeof(ip
),
321 info
->in_src
= ip
.saddr
;
322 info
->in_dst
= ip
.daddr
;
/*
 * NOTE(review): frag_off carries the IPv4 flag bits as well as the fragment
 * offset, so !!ip.frag_off is also true for an unfragmented packet that has
 * only Don't Fragment set; such packets then skip the L4 port extraction
 * below. Confirm whether a mask of MF plus the offset bits was intended.
 */
323 info
->is_fragmented
= !!ip
.frag_off
;
325 l4_protocol
= ip
.protocol
;
/* ihl is the header length in 32-bit words. */
326 l4_offset
= ip
.ihl
* 4;
327 } else if (l3_protocol
== ETH_P_IPV6
) {
330 struct ipv6hdr ip6
= {};
331 err
= bpf_skb_load_bytes_relative(skb
, 0, &ip6
, sizeof(ip6
),
337 info
->in6_src
= ip6
.saddr
;
338 info
->in6_dst
= ip6
.daddr
;
340 l4_protocol
= ip6
.nexthdr
;
341 l4_offset
= sizeof(ip6
);
/* May replace l4_protocol/l4_offset and set the ext/fragment flags. */
343 err
= parse_ipv6_ext(skb
, info
, &l4_protocol
, &l4_offset
);
/* Ports are only read from unfragmented packets. */
349 if (l4_protocol
!= 0 && !info
->is_fragmented
) {
350 if (l4_protocol
== IPPROTO_TCP
) {
353 struct tcphdr tcp
= {};
354 err
= bpf_skb_load_bytes_relative(skb
, l4_offset
, &tcp
, sizeof(tcp
),
/* Ports are stored as loaded, without byte-order conversion. */
360 info
->src_port
= tcp
.source
;
361 info
->dst_port
= tcp
.dest
;
362 } else if (l4_protocol
== IPPROTO_UDP
) { /* TODO: add udplite? */
365 struct udphdr udp
= {};
366 err
= bpf_skb_load_bytes_relative(skb
, l4_offset
, &udp
, sizeof(udp
),
372 info
->src_port
= udp
.source
;
373 info
->dst_port
= udp
.dest
;
/*
 * Build the RSS hash input for the packet and run the Toeplitz hash over it.
 *
 * skb:    packet to hash.
 * config: hash_types selects which field combinations may be hashed.
 * toe:    Toeplitz key passed through to net_toeplitz_add().
 *
 * Fields are appended to a zero-initialised HASH_CALCULATION_BUFFER_SIZE
 * byte buffer in the order source address, destination address, source
 * port, destination port. The branch taken depends on the packet flags set
 * by parse_packet() and on the matching VIRTIO_NET_RSS_HASH_TYPE_* bit:
 *   IPv4: TCPv4, else UDPv4, else IPv4 (addresses only).
 *   IPv6: TCPv6, else UDPv6, else IPv6 (addresses only); where the *_EX bit
 *         is set and an extension address was found, in6_ext_src and
 *         in6_ext_dst are added.
 * The `else` lines pairing each in6_ext_* chunk with the plain in6_* chunk,
 * some call arguments, the error path and the return statement are on lines
 * not visible in this excerpt.
 */
383 static inline __u32
calculate_rss_hash(struct __sk_buff
*skb
,
384 struct rss_config_t
*config
, struct toeplitz_key_data_t
*toe
)
386 __u8 rss_input
[HASH_CALCULATION_BUFFER_SIZE
] = {};
387 size_t bytes_written
= 0;
390 struct packet_hash_info_t packet_info
= {};
392 err
= parse_packet(skb
, &packet_info
);
397 if (packet_info
.is_ipv4
) {
398 if (packet_info
.is_tcp
&&
399 config
->hash_types
& VIRTIO_NET_RSS_HASH_TYPE_TCPv4
) {
401 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
403 sizeof(packet_info
.in_src
));
404 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
406 sizeof(packet_info
.in_dst
));
407 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
408 &packet_info
.src_port
,
409 sizeof(packet_info
.src_port
));
410 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
411 &packet_info
.dst_port
,
412 sizeof(packet_info
.dst_port
));
413 } else if (packet_info
.is_udp
&&
414 config
->hash_types
& VIRTIO_NET_RSS_HASH_TYPE_UDPv4
) {
416 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
418 sizeof(packet_info
.in_src
));
419 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
421 sizeof(packet_info
.in_dst
));
422 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
423 &packet_info
.src_port
,
424 sizeof(packet_info
.src_port
));
425 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
426 &packet_info
.dst_port
,
427 sizeof(packet_info
.dst_port
));
/* Fallback for IPv4: addresses only. */
428 } else if (config
->hash_types
& VIRTIO_NET_RSS_HASH_TYPE_IPv4
) {
429 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
431 sizeof(packet_info
.in_src
));
432 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
434 sizeof(packet_info
.in_dst
));
436 } else if (packet_info
.is_ipv6
) {
437 if (packet_info
.is_tcp
&&
438 config
->hash_types
& VIRTIO_NET_RSS_HASH_TYPE_TCPv6
) {
440 if (packet_info
.is_ipv6_ext_src
&&
441 config
->hash_types
& VIRTIO_NET_RSS_HASH_TYPE_TCP_EX
) {
443 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
444 &packet_info
.in6_ext_src
,
445 sizeof(packet_info
.in6_ext_src
));
447 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
448 &packet_info
.in6_src
,
449 sizeof(packet_info
.in6_src
));
451 if (packet_info
.is_ipv6_ext_dst
&&
452 config
->hash_types
& VIRTIO_NET_RSS_HASH_TYPE_TCP_EX
) {
454 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
455 &packet_info
.in6_ext_dst
,
456 sizeof(packet_info
.in6_ext_dst
));
458 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
459 &packet_info
.in6_dst
,
460 sizeof(packet_info
.in6_dst
));
462 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
463 &packet_info
.src_port
,
464 sizeof(packet_info
.src_port
));
465 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
466 &packet_info
.dst_port
,
467 sizeof(packet_info
.dst_port
));
468 } else if (packet_info
.is_udp
&&
469 config
->hash_types
& VIRTIO_NET_RSS_HASH_TYPE_UDPv6
) {
471 if (packet_info
.is_ipv6_ext_src
&&
472 config
->hash_types
& VIRTIO_NET_RSS_HASH_TYPE_UDP_EX
) {
474 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
475 &packet_info
.in6_ext_src
,
476 sizeof(packet_info
.in6_ext_src
));
478 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
479 &packet_info
.in6_src
,
480 sizeof(packet_info
.in6_src
));
482 if (packet_info
.is_ipv6_ext_dst
&&
483 config
->hash_types
& VIRTIO_NET_RSS_HASH_TYPE_UDP_EX
) {
485 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
486 &packet_info
.in6_ext_dst
,
487 sizeof(packet_info
.in6_ext_dst
));
489 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
490 &packet_info
.in6_dst
,
491 sizeof(packet_info
.in6_dst
));
494 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
495 &packet_info
.src_port
,
496 sizeof(packet_info
.src_port
));
497 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
498 &packet_info
.dst_port
,
499 sizeof(packet_info
.dst_port
));
/* Fallback for IPv6: addresses only. */
501 } else if (config
->hash_types
& VIRTIO_NET_RSS_HASH_TYPE_IPv6
) {
502 if (packet_info
.is_ipv6_ext_src
&&
503 config
->hash_types
& VIRTIO_NET_RSS_HASH_TYPE_IP_EX
) {
505 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
506 &packet_info
.in6_ext_src
,
507 sizeof(packet_info
.in6_ext_src
));
509 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
510 &packet_info
.in6_src
,
511 sizeof(packet_info
.in6_src
));
513 if (packet_info
.is_ipv6_ext_dst
&&
514 config
->hash_types
& VIRTIO_NET_RSS_HASH_TYPE_IP_EX
) {
516 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
517 &packet_info
.in6_ext_dst
,
518 sizeof(packet_info
.in6_ext_dst
));
520 net_rx_rss_add_chunk(rss_input
, &bytes_written
,
521 &packet_info
.in6_dst
,
522 sizeof(packet_info
.in6_dst
));
/* Hash whatever was collected; the buffer is zero-filled beyond it. */
528 net_toeplitz_add(&result
, rss_input
, bytes_written
, toe
);
/*
 * Program entry point, placed in the "tun_rss_steering" section.
 *
 * Looks up the RSS configuration and Toeplitz key with the same `key`.
 * If config->redirect is clear it returns config->default_queue. Otherwise
 * it hashes the packet, reduces the hash modulo config->indirections_len
 * and looks up the indirection table; config->default_queue is also the
 * return value of a later path.
 *
 * NOTE(review): the NULL check on `config`/`toe` and the handling of the
 * indirection-table lookup result are on lines not visible in this excerpt;
 * confirm both pointers are validated before they are dereferenced.
 */
534 SEC("tun_rss_steering")
535 int tun_rss_steering_prog(struct __sk_buff
*skb
)
538 struct rss_config_t
*config
;
539 struct toeplitz_key_data_t
*toe
;
544 config
= bpf_map_lookup_elem(&tap_rss_map_configurations
, &key
);
545 toe
= bpf_map_lookup_elem(&tap_rss_map_toeplitz_key
, &key
);
548 if (!config
->redirect
) {
549 return config
->default_queue
;
552 hash
= calculate_rss_hash(skb
, config
, toe
);
/*
 * NOTE(review): no visible guard against indirections_len == 0 or a value
 * above INDIRECTION_TABLE_SIZE before this modulo -- confirm the loader
 * guarantees a valid length.
 */
554 __u32 table_idx
= hash
% config
->indirections_len
;
557 queue
= bpf_map_lookup_elem(&tap_rss_map_indirection_table
,
565 return config
->default_queue
;
/* License string emitted into the object's "license" section. */
571 char _license
[] SEC("license") = "GPL v2";