// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/rhashtable.h>
#include <linux/netdevice.h>
#include <net/ip.h>
#include <net/ip6_route.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_tuple.h>
static DEFINE_MUTEX(flowtable_lock);
static LIST_HEAD(flowtables);

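/* Populate the flow tuple for one direction from the conntrack tuple:
 * addresses, ports and L3/L4 protocol numbers.
 */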
static void
flow_offload_fill_dir(struct flow_offload *flow,
		      enum flow_offload_tuple_dir dir)
{
	struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
	struct nf_conntrack_tuple *ctt = &flow->ct->tuplehash[dir].tuple;

	ft->dir = dir;

	switch (ctt->src.l3num) {
	case NFPROTO_IPV4:
		ft->src_v4 = ctt->src.u3.in;
		ft->dst_v4 = ctt->dst.u3.in;
		break;
	case NFPROTO_IPV6:
		ft->src_v6 = ctt->src.u3.in6;
		ft->dst_v6 = ctt->dst.u3.in6;
		break;
	}

	ft->l3proto = ctt->src.l3num;
	ft->l4proto = ctt->dst.protonum;
	ft->src_port = ctt->src.u.tcp.port;
	ft->dst_port = ctt->dst.u.tcp.port;
}

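/* Allocate a flow entry and take a reference on the conntrack entry.
 * Returns NULL if @ct is dying or its refcount has already dropped to zero.
 */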
struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
{
	struct flow_offload *flow;

	if (unlikely(nf_ct_is_dying(ct) ||
	    !atomic_inc_not_zero(&ct->ct_general.use)))
		return NULL;

	flow = kzalloc(sizeof(*flow), GFP_ATOMIC);
	if (!flow)
		goto err_ct_refcnt;

	flow->ct = ct;

	flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
	flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY);

	if (ct->status & IPS_SRC_NAT)
		__set_bit(NF_FLOW_SNAT, &flow->flags);
	if (ct->status & IPS_DST_NAT)
		__set_bit(NF_FLOW_DNAT, &flow->flags);

	return flow;

err_ct_refcnt:
	nf_ct_put(ct);

	return NULL;
}
EXPORT_SYMBOL_GPL(flow_offload_alloc);

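/* Cache the forwarding state for one direction: MTU, input interface and
 * encapsulation stack (stored in reverse order for transmission), plus
 * either the destination/source MACs (direct xmit) or a held dst entry
 * (neighbour/xfrm xmit).
 */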
static int flow_offload_fill_route(struct flow_offload *flow,
				   const struct nf_flow_route *route,
				   enum flow_offload_tuple_dir dir)
{
	struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
	struct dst_entry *dst = route->tuple[dir].dst;
	int i, j = 0;

	switch (flow_tuple->l3proto) {
	case NFPROTO_IPV4:
		flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true);
		break;
	case NFPROTO_IPV6:
		flow_tuple->mtu = ip6_dst_mtu_forward(dst);
		break;
	}

	flow_tuple->iifidx = route->tuple[dir].in.ifindex;
	for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) {
		flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id;
		flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto;
		j++;
	}
	flow_tuple->encap_num = route->tuple[dir].in.num_encaps;

	switch (route->tuple[dir].xmit_type) {
	case FLOW_OFFLOAD_XMIT_DIRECT:
		memcpy(flow_tuple->out.h_dest, route->tuple[dir].out.h_dest,
		       ETH_ALEN);
		memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source,
		       ETH_ALEN);
		flow_tuple->out.ifidx = route->tuple[dir].out.ifindex;
		break;
	case FLOW_OFFLOAD_XMIT_XFRM:
	case FLOW_OFFLOAD_XMIT_NEIGH:
		if (!dst_hold_safe(route->tuple[dir].dst))
			return -1;

		flow_tuple->dst_cache = dst;
		break;
	}
	flow_tuple->xmit_type = route->tuple[dir].xmit_type;

	return 0;
}

static void nft_flow_dst_release(struct flow_offload *flow,
				 enum flow_offload_tuple_dir dir)
{
	if (flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
	    flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)
		dst_release(flow->tuplehash[dir].tuple.dst_cache);
}

int flow_offload_route_init(struct flow_offload *flow,
			    const struct nf_flow_route *route)
{
	int err;

	err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL);
	if (err < 0)
		return err;

	err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY);
	if (err < 0)
		goto err_route_reply;

	flow->type = NF_FLOW_OFFLOAD_ROUTE;

	return 0;

err_route_reply:
	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);

	return err;
}
EXPORT_SYMBOL_GPL(flow_offload_route_init);

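/* Once a flow is handed back to classic conntrack, relax TCP window
 * tracking and grant the conntrack entry a fresh pickup timeout.
 */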
static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
{
	tcp->state = TCP_CONNTRACK_ESTABLISHED;
	tcp->seen[0].td_maxwin = 0;
	tcp->seen[1].td_maxwin = 0;
}

#define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT	(120 * HZ)
#define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT	(30 * HZ)

static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
{
	const struct nf_conntrack_l4proto *l4proto;
	int l4num = nf_ct_protonum(ct);
	unsigned int timeout;

	l4proto = nf_ct_l4proto_find(l4num);
	if (!l4proto)
		return;

	if (l4num == IPPROTO_TCP)
		timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT;
	else if (l4num == IPPROTO_UDP)
		timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT;
	else
		return;

	if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout)
		ct->timeout = nfct_time_stamp + timeout;
}

static void flow_offload_fixup_ct_state(struct nf_conn *ct)
{
	if (nf_ct_protonum(ct) == IPPROTO_TCP)
		flow_offload_fixup_tcp(&ct->proto.tcp);
}

static void flow_offload_fixup_ct(struct nf_conn *ct)
{
	flow_offload_fixup_ct_state(ct);
	flow_offload_fixup_ct_timeout(ct);
}

static void flow_offload_route_release(struct flow_offload *flow)
{
	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_REPLY);
}

void flow_offload_free(struct flow_offload *flow)
{
	switch (flow->type) {
	case NF_FLOW_OFFLOAD_ROUTE:
		flow_offload_route_release(flow);
		break;
	default:
		break;
	}
	nf_ct_put(flow->ct);
	kfree_rcu(flow, rcu_head);
}
EXPORT_SYMBOL_GPL(flow_offload_free);

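/* The rhashtable hashes and compares only the leading bytes of the flow
 * tuple, up to the __hash marker field; both directions of a flow are
 * inserted as separate nodes.
 */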
static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
{
	const struct flow_offload_tuple *tuple = data;

	return jhash(tuple, offsetof(struct flow_offload_tuple, __hash), seed);
}

static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
{
	const struct flow_offload_tuple_rhash *tuplehash = data;

	return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, __hash), seed);
}

static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
				 const void *ptr)
{
	const struct flow_offload_tuple *tuple = arg->key;
	const struct flow_offload_tuple_rhash *x = ptr;

	if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, __hash)))
		return 1;

	return 0;
}

static const struct rhashtable_params nf_flow_offload_rhash_params = {
	.head_offset		= offsetof(struct flow_offload_tuple_rhash, node),
	.hashfn			= flow_offload_hash,
	.obj_hashfn		= flow_offload_hash_obj,
	.obj_cmpfn		= flow_offload_hash_cmp,
	.automatic_shrinking	= true,
};

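/* Insert both directions of the flow into the table, arm the timeout and,
 * if the flowtable supports it, push the flow to hardware.
 */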
int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
{
	int err;

	flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT;

	err = rhashtable_insert_fast(&flow_table->rhashtable,
				     &flow->tuplehash[0].node,
				     nf_flow_offload_rhash_params);
	if (err < 0)
		return err;

	err = rhashtable_insert_fast(&flow_table->rhashtable,
				     &flow->tuplehash[1].node,
				     nf_flow_offload_rhash_params);
	if (err < 0) {
		rhashtable_remove_fast(&flow_table->rhashtable,
				       &flow->tuplehash[0].node,
				       nf_flow_offload_rhash_params);
		return err;
	}

	nf_ct_offload_timeout(flow->ct);

	if (nf_flowtable_hw_offload(flow_table)) {
		__set_bit(NF_FLOW_HW, &flow->flags);
		nf_flow_offload_add(flow_table, flow);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(flow_offload_add);

void flow_offload_refresh(struct nf_flowtable *flow_table,
			  struct flow_offload *flow)
{
	flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT;

	if (likely(!nf_flowtable_hw_offload(flow_table) ||
		   !test_and_clear_bit(NF_FLOW_HW_REFRESH, &flow->flags)))
		return;

	nf_flow_offload_add(flow_table, flow);
}
EXPORT_SYMBOL_GPL(flow_offload_refresh);

static inline bool nf_flow_has_expired(const struct flow_offload *flow)
{
	return nf_flow_timeout_delta(flow->timeout) <= 0;
}

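/* Unlink both directions and return the connection to conntrack; an
 * expired flow also gets its TCP state fixed up, a torn-down one only
 * its timeout.
 */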
static void flow_offload_del(struct nf_flowtable *flow_table,
			     struct flow_offload *flow)
{
	rhashtable_remove_fast(&flow_table->rhashtable,
			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
			       nf_flow_offload_rhash_params);
	rhashtable_remove_fast(&flow_table->rhashtable,
			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
			       nf_flow_offload_rhash_params);

	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);

	if (nf_flow_has_expired(flow))
		flow_offload_fixup_ct(flow->ct);
	else
		flow_offload_fixup_ct_timeout(flow->ct);

	flow_offload_free(flow);
}

void flow_offload_teardown(struct flow_offload *flow)
{
	set_bit(NF_FLOW_TEARDOWN, &flow->flags);

	flow_offload_fixup_ct_state(flow->ct);
}
EXPORT_SYMBOL_GPL(flow_offload_teardown);

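/* Look up a flow by tuple; entries under teardown or attached to a dying
 * conntrack entry are treated as lookup misses.
 */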
struct flow_offload_tuple_rhash *
flow_offload_lookup(struct nf_flowtable *flow_table,
		    struct flow_offload_tuple *tuple)
{
	struct flow_offload_tuple_rhash *tuplehash;
	struct flow_offload *flow;
	int dir;

	tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple,
				      nf_flow_offload_rhash_params);
	if (!tuplehash)
		return NULL;

	dir = tuplehash->tuple.dir;
	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
	if (test_bit(NF_FLOW_TEARDOWN, &flow->flags))
		return NULL;

	if (unlikely(nf_ct_is_dying(flow->ct)))
		return NULL;

	return tuplehash;
}
EXPORT_SYMBOL_GPL(flow_offload_lookup);

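/* Walk the flowtable and invoke @iter once per flow; reply-direction
 * nodes are skipped so each flow is visited only once.
 */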
static int
nf_flow_table_iterate(struct nf_flowtable *flow_table,
		      void (*iter)(struct flow_offload *flow, void *data),
		      void *data)
{
	struct flow_offload_tuple_rhash *tuplehash;
	struct rhashtable_iter hti;
	struct flow_offload *flow;
	int err = 0;

	rhashtable_walk_enter(&flow_table->rhashtable, &hti);
	rhashtable_walk_start(&hti);

	while ((tuplehash = rhashtable_walk_next(&hti))) {
		if (IS_ERR(tuplehash)) {
			if (PTR_ERR(tuplehash) != -EAGAIN) {
				err = PTR_ERR(tuplehash);
				break;
			}
			continue;
		}
		if (tuplehash->tuple.dir)
			continue;

		flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);

		iter(flow, data);
	}
	rhashtable_walk_stop(&hti);
	rhashtable_walk_exit(&hti);

	return err;
}

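/* Garbage collection: expired flows and flows whose conntrack entry is
 * dying are flagged for teardown. Hardware-offloaded flows are first
 * removed from hardware and only freed once the driver reports them dead;
 * live hardware flows just get their stats refreshed.
 */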
static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data)
{
	struct nf_flowtable *flow_table = data;

	if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct))
		set_bit(NF_FLOW_TEARDOWN, &flow->flags);

	if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) {
		if (test_bit(NF_FLOW_HW, &flow->flags)) {
			if (!test_bit(NF_FLOW_HW_DYING, &flow->flags))
				nf_flow_offload_del(flow_table, flow);
			else if (test_bit(NF_FLOW_HW_DEAD, &flow->flags))
				flow_offload_del(flow_table, flow);
		} else {
			flow_offload_del(flow_table, flow);
		}
	} else if (test_bit(NF_FLOW_HW, &flow->flags)) {
		nf_flow_offload_stats(flow_table, flow);
	}
}

static void nf_flow_offload_work_gc(struct work_struct *work)
{
	struct nf_flowtable *flow_table;

	flow_table = container_of(work, struct nf_flowtable, gc_work.work);
	nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
	queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
}

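/* Port NAT helpers: rewrite the transport checksum to match the mangled
 * ports. A UDP checksum of zero means "no checksum", so a recomputed zero
 * is stored as CSUM_MANGLED_0 instead.
 */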
static void nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
				 __be16 port, __be16 new_port)
{
	struct tcphdr *tcph;

	tcph = (void *)(skb_network_header(skb) + thoff);
	inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false);
}

static void nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
				 __be16 port, __be16 new_port)
{
	struct udphdr *udph;

	udph = (void *)(skb_network_header(skb) + thoff);
	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
		inet_proto_csum_replace2(&udph->check, skb, port,
					 new_port, false);
		if (!udph->check)
			udph->check = CSUM_MANGLED_0;
	}
}

static void nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
			     u8 protocol, __be16 port, __be16 new_port)
{
	switch (protocol) {
	case IPPROTO_TCP:
		nf_flow_nat_port_tcp(skb, thoff, port, new_port);
		break;
	case IPPROTO_UDP:
		nf_flow_nat_port_udp(skb, thoff, port, new_port);
		break;
	}
}

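/* Source port NAT: rewrite the source port in the original direction and
 * the destination port in the reply direction, using the ports cached in
 * the opposite direction's tuple.
 */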
void nf_flow_snat_port(const struct flow_offload *flow,
		       struct sk_buff *skb, unsigned int thoff,
		       u8 protocol, enum flow_offload_tuple_dir dir)
{
	struct flow_ports *hdr;
	__be16 port, new_port;

	hdr = (void *)(skb_network_header(skb) + thoff);

	switch (dir) {
	case FLOW_OFFLOAD_DIR_ORIGINAL:
		port = hdr->source;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port;
		hdr->source = new_port;
		break;
	case FLOW_OFFLOAD_DIR_REPLY:
		port = hdr->dest;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
		hdr->dest = new_port;
		break;
	}

	nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_snat_port);

void nf_flow_dnat_port(const struct flow_offload *flow, struct sk_buff *skb,
		       unsigned int thoff, u8 protocol,
		       enum flow_offload_tuple_dir dir)
{
	struct flow_ports *hdr;
	__be16 port, new_port;

	hdr = (void *)(skb_network_header(skb) + thoff);

	switch (dir) {
	case FLOW_OFFLOAD_DIR_ORIGINAL:
		port = hdr->dest;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port;
		hdr->dest = new_port;
		break;
	case FLOW_OFFLOAD_DIR_REPLY:
		port = hdr->source;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
		hdr->source = new_port;
		break;
	}

	nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_dnat_port);

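/* Initialise a flowtable: set up the gc work, the flow block and the
 * rhashtable, then register the table on the global flowtables list.
 */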
int nf_flow_table_init(struct nf_flowtable *flowtable)
{
	int err;

	INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
	flow_block_init(&flowtable->flow_block);
	init_rwsem(&flowtable->flow_block_lock);

	err = rhashtable_init(&flowtable->rhashtable,
			      &nf_flow_offload_rhash_params);
	if (err < 0)
		return err;

	queue_delayed_work(system_power_efficient_wq,
			   &flowtable->gc_work, HZ);

	mutex_lock(&flowtable_lock);
	list_add(&flowtable->list, &flowtables);
	mutex_unlock(&flowtable_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(nf_flow_table_init);

static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
{
	struct net_device *dev = data;

	if (!dev) {
		flow_offload_teardown(flow);
		return;
	}

	if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) &&
	    (flow->tuplehash[0].tuple.iifidx == dev->ifindex ||
	     flow->tuplehash[1].tuple.iifidx == dev->ifindex))
		flow_offload_teardown(flow);
}

void nf_flow_table_gc_cleanup(struct nf_flowtable *flowtable,
			      struct net_device *dev)
{
	nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
	flush_delayed_work(&flowtable->gc_work);
	nf_flow_table_offload_flush(flowtable);
}

void nf_flow_table_cleanup(struct net_device *dev)
{
	struct nf_flowtable *flowtable;

	mutex_lock(&flowtable_lock);
	list_for_each_entry(flowtable, &flowtables, list)
		nf_flow_table_gc_cleanup(flowtable, dev);
	mutex_unlock(&flowtable_lock);
}
EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);

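/* Flowtable shutdown: unregister the table, stop gc, tear down and release
 * all remaining entries (running the gc step twice when hardware offload
 * is involved, so the driver can complete removal), flush pending offload
 * work and destroy the hashtable.
 */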
void nf_flow_table_free(struct nf_flowtable *flow_table)
{
	mutex_lock(&flowtable_lock);
	list_del(&flow_table->list);
	mutex_unlock(&flowtable_lock);

	cancel_delayed_work_sync(&flow_table->gc_work);
	nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
	nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
	nf_flow_table_offload_flush(flow_table);
	if (nf_flowtable_hw_offload(flow_table))
		nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step,
				      flow_table);
	rhashtable_destroy(&flow_table->rhashtable);
}
EXPORT_SYMBOL_GPL(nf_flow_table_free);

static int __init nf_flow_table_module_init(void)
{
	return nf_flow_table_offload_init();
}

static void __exit nf_flow_table_module_exit(void)
{
	nf_flow_table_offload_exit();
}

module_init(nf_flow_table_module_init);
module_exit(nf_flow_table_module_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_DESCRIPTION("Netfilter flow table module");