]> git.proxmox.com Git - mirror_ubuntu-jammy-kernel.git/blame - net/netfilter/nf_flow_table_core.c
netfilter: nf_tables: add flowtable offload control plane
[mirror_ubuntu-jammy-kernel.git] / net / netfilter / nf_flow_table_core.c
CommitLineData
09c434b8 1// SPDX-License-Identifier: GPL-2.0-only
ac2a6666
PNA
2#include <linux/kernel.h>
3#include <linux/init.h>
4#include <linux/module.h>
5#include <linux/netfilter.h>
6#include <linux/rhashtable.h>
7#include <linux/netdevice.h>
4f3780c0
FF
8#include <net/ip.h>
9#include <net/ip6_route.h>
c0ea1bcb 10#include <net/netfilter/nf_tables.h>
ac2a6666
PNA
11#include <net/netfilter/nf_flow_table.h>
12#include <net/netfilter/nf_conntrack.h>
13#include <net/netfilter/nf_conntrack_core.h>
40d102cd 14#include <net/netfilter/nf_conntrack_l4proto.h>
ac2a6666
PNA
15#include <net/netfilter/nf_conntrack_tuple.h>
16
84453a90
FF
/* Global list of flowtables; entries are added by nf_flow_table_init(). */
static LIST_HEAD(flowtables);
/* Held around every insertion into, removal from and walk of 'flowtables'. */
static DEFINE_MUTEX(flowtable_lock);
19
047b300e
FF
20static void
21flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
047b300e
FF
22 enum flow_offload_tuple_dir dir)
23{
24 struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
25 struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple;
26
27 ft->dir = dir;
28
29 switch (ctt->src.l3num) {
30 case NFPROTO_IPV4:
31 ft->src_v4 = ctt->src.u3.in;
32 ft->dst_v4 = ctt->dst.u3.in;
33 break;
34 case NFPROTO_IPV6:
35 ft->src_v6 = ctt->src.u3.in6;
36 ft->dst_v6 = ctt->dst.u3.in6;
37 break;
38 }
39
40 ft->l3proto = ctt->src.l3num;
41 ft->l4proto = ctt->dst.protonum;
42 ft->src_port = ctt->src.u.tcp.port;
43 ft->dst_port = ctt->dst.u.tcp.port;
047b300e
FF
44}
45
f1363e05 46struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
ac2a6666 47{
ac2a6666
PNA
48 struct flow_offload *flow;
49
50 if (unlikely(nf_ct_is_dying(ct) ||
51 !atomic_inc_not_zero(&ct->ct_general.use)))
52 return NULL;
53
62248df8
PNA
54 flow = kzalloc(sizeof(*flow), GFP_ATOMIC);
55 if (!flow)
ac2a6666
PNA
56 goto err_ct_refcnt;
57
b32d2f34 58 flow->ct = ct;
ac2a6666 59
f1363e05
PNA
60 flow_offload_fill_dir(flow, ct, FLOW_OFFLOAD_DIR_ORIGINAL);
61 flow_offload_fill_dir(flow, ct, FLOW_OFFLOAD_DIR_REPLY);
ac2a6666
PNA
62
63 if (ct->status & IPS_SRC_NAT)
64 flow->flags |= FLOW_OFFLOAD_SNAT;
df1e2025 65 if (ct->status & IPS_DST_NAT)
ac2a6666
PNA
66 flow->flags |= FLOW_OFFLOAD_DNAT;
67
68 return flow;
69
ac2a6666
PNA
70err_ct_refcnt:
71 nf_ct_put(ct);
72
73 return NULL;
74}
75EXPORT_SYMBOL_GPL(flow_offload_alloc);
76
f1363e05
PNA
77static int flow_offload_fill_route(struct flow_offload *flow,
78 const struct nf_flow_route *route,
79 enum flow_offload_tuple_dir dir)
80{
81 struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
82 struct dst_entry *other_dst = route->tuple[!dir].dst;
83 struct dst_entry *dst = route->tuple[dir].dst;
84
85 if (!dst_hold_safe(route->tuple[dir].dst))
86 return -1;
87
88 switch (flow_tuple->l3proto) {
89 case NFPROTO_IPV4:
90 flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true);
91 break;
92 case NFPROTO_IPV6:
93 flow_tuple->mtu = ip6_dst_mtu_forward(dst);
94 break;
95 }
96
97 flow_tuple->iifidx = other_dst->dev->ifindex;
98 flow_tuple->dst_cache = dst;
99
100 return 0;
101}
102
103int flow_offload_route_init(struct flow_offload *flow,
104 const struct nf_flow_route *route)
105{
106 int err;
107
108 err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL);
109 if (err < 0)
110 return err;
111
112 err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY);
113 if (err < 0)
114 goto err_route_reply;
115
116 flow->type = NF_FLOW_OFFLOAD_ROUTE;
117
118 return 0;
119
120err_route_reply:
121 dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);
122
123 return err;
124}
125EXPORT_SYMBOL_GPL(flow_offload_route_init);
126
da5984e5
FF
127static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
128{
129 tcp->state = TCP_CONNTRACK_ESTABLISHED;
130 tcp->seen[0].td_maxwin = 0;
131 tcp->seen[1].td_maxwin = 0;
132}
133
e97d9404
FW
/*
 * Upper bounds (in jiffies) that flow_offload_fixup_ct_timeout() applies to
 * the conntrack timeout of TCP and UDP entries respectively.
 */
#define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT	(30 * HZ)
#define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT	(120 * HZ)
136
1e5b2471
PNA
137static inline __s32 nf_flow_timeout_delta(unsigned int timeout)
138{
139 return (__s32)(timeout - (u32)jiffies);
140}
141
142static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
da5984e5
FF
143{
144 const struct nf_conntrack_l4proto *l4proto;
1e5b2471 145 int l4num = nf_ct_protonum(ct);
da5984e5 146 unsigned int timeout;
da5984e5 147
4a60dc74 148 l4proto = nf_ct_l4proto_find(l4num);
da5984e5
FF
149 if (!l4proto)
150 return;
151
da5984e5 152 if (l4num == IPPROTO_TCP)
e97d9404 153 timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT;
da5984e5 154 else if (l4num == IPPROTO_UDP)
e97d9404 155 timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT;
da5984e5
FF
156 else
157 return;
158
1e5b2471
PNA
159 if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout)
160 ct->timeout = nfct_time_stamp + timeout;
161}
162
163static void flow_offload_fixup_ct_state(struct nf_conn *ct)
164{
165 if (nf_ct_protonum(ct) == IPPROTO_TCP)
166 flow_offload_fixup_tcp(&ct->proto.tcp);
167}
168
/*
 * Full conntrack fixup for @ct: reset the protocol state first, then cap
 * the timeout.
 */
static void flow_offload_fixup_ct(struct nf_conn *ct)
{
	flow_offload_fixup_ct_state(ct);
	flow_offload_fixup_ct_timeout(ct);
}
174
f1363e05 175static void flow_offload_route_release(struct flow_offload *flow)
ac2a6666 176{
ac2a6666
PNA
177 dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
178 dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
f1363e05
PNA
179}
180
181void flow_offload_free(struct flow_offload *flow)
182{
183 switch (flow->type) {
184 case NF_FLOW_OFFLOAD_ROUTE:
185 flow_offload_route_release(flow);
186 break;
187 default:
188 break;
189 }
da5984e5 190 if (flow->flags & FLOW_OFFLOAD_DYING)
b32d2f34
PNA
191 nf_ct_delete(flow->ct, 0, 0);
192 nf_ct_put(flow->ct);
62248df8 193 kfree_rcu(flow, rcu_head);
ac2a6666
PNA
194}
195EXPORT_SYMBOL_GPL(flow_offload_free);
196
a268de77
FF
197static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
198{
199 const struct flow_offload_tuple *tuple = data;
200
201 return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
202}
203
204static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
205{
206 const struct flow_offload_tuple_rhash *tuplehash = data;
207
208 return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
209}
210
211static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
212 const void *ptr)
213{
214 const struct flow_offload_tuple *tuple = arg->key;
215 const struct flow_offload_tuple_rhash *x = ptr;
216
217 if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
218 return 1;
219
220 return 0;
221}
222
223static const struct rhashtable_params nf_flow_offload_rhash_params = {
224 .head_offset = offsetof(struct flow_offload_tuple_rhash, node),
225 .hashfn = flow_offload_hash,
226 .obj_hashfn = flow_offload_hash_obj,
227 .obj_cmpfn = flow_offload_hash_cmp,
228 .automatic_shrinking = true,
229};
230
ac2a6666
PNA
231int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
232{
43c8f131
TY
233 int err;
234
daf61b02
PNA
235 flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
236
43c8f131
TY
237 err = rhashtable_insert_fast(&flow_table->rhashtable,
238 &flow->tuplehash[0].node,
239 nf_flow_offload_rhash_params);
240 if (err < 0)
241 return err;
242
243 err = rhashtable_insert_fast(&flow_table->rhashtable,
244 &flow->tuplehash[1].node,
245 nf_flow_offload_rhash_params);
246 if (err < 0) {
247 rhashtable_remove_fast(&flow_table->rhashtable,
248 &flow->tuplehash[0].node,
249 nf_flow_offload_rhash_params);
250 return err;
251 }
ac2a6666 252
ac2a6666
PNA
253 return 0;
254}
255EXPORT_SYMBOL_GPL(flow_offload_add);
256
3e68db2f
PNA
257static inline bool nf_flow_has_expired(const struct flow_offload *flow)
258{
1e5b2471 259 return nf_flow_timeout_delta(flow->timeout) <= 0;
3e68db2f
PNA
260}
261
0ff90b6c
FF
262static void flow_offload_del(struct nf_flowtable *flow_table,
263 struct flow_offload *flow)
ac2a6666 264{
ac2a6666
PNA
265 rhashtable_remove_fast(&flow_table->rhashtable,
266 &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
a268de77 267 nf_flow_offload_rhash_params);
ac2a6666
PNA
268 rhashtable_remove_fast(&flow_table->rhashtable,
269 &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
a268de77 270 nf_flow_offload_rhash_params);
ac2a6666 271
b32d2f34 272 clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
da5984e5 273
3e68db2f 274 if (nf_flow_has_expired(flow))
b32d2f34 275 flow_offload_fixup_ct(flow->ct);
1e5b2471 276 else if (flow->flags & FLOW_OFFLOAD_TEARDOWN)
b32d2f34 277 flow_offload_fixup_ct_timeout(flow->ct);
3e68db2f 278
0ff90b6c 279 flow_offload_free(flow);
ac2a6666 280}
ac2a6666 281
59c466dd
FF
282void flow_offload_teardown(struct flow_offload *flow)
283{
284 flow->flags |= FLOW_OFFLOAD_TEARDOWN;
da5984e5 285
b32d2f34 286 flow_offload_fixup_ct_state(flow->ct);
59c466dd
FF
287}
288EXPORT_SYMBOL_GPL(flow_offload_teardown);
289
ac2a6666
PNA
290struct flow_offload_tuple_rhash *
291flow_offload_lookup(struct nf_flowtable *flow_table,
292 struct flow_offload_tuple *tuple)
293{
ba03137f
FF
294 struct flow_offload_tuple_rhash *tuplehash;
295 struct flow_offload *flow;
296 int dir;
297
a2d88182
TY
298 tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple,
299 nf_flow_offload_rhash_params);
ba03137f
FF
300 if (!tuplehash)
301 return NULL;
302
303 dir = tuplehash->tuple.dir;
304 flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
305 if (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))
306 return NULL;
307
b32d2f34 308 if (unlikely(nf_ct_is_dying(flow->ct)))
8cd2bc98
TY
309 return NULL;
310
ba03137f 311 return tuplehash;
ac2a6666
PNA
312}
313EXPORT_SYMBOL_GPL(flow_offload_lookup);
314
49de9c09
TY
315static int
316nf_flow_table_iterate(struct nf_flowtable *flow_table,
317 void (*iter)(struct flow_offload *flow, void *data),
318 void *data)
ac2a6666
PNA
319{
320 struct flow_offload_tuple_rhash *tuplehash;
321 struct rhashtable_iter hti;
322 struct flow_offload *flow;
0de22baa 323 int err = 0;
ac2a6666 324
0de22baa 325 rhashtable_walk_enter(&flow_table->rhashtable, &hti);
ac2a6666
PNA
326 rhashtable_walk_start(&hti);
327
328 while ((tuplehash = rhashtable_walk_next(&hti))) {
329 if (IS_ERR(tuplehash)) {
0de22baa
TY
330 if (PTR_ERR(tuplehash) != -EAGAIN) {
331 err = PTR_ERR(tuplehash);
332 break;
333 }
ac2a6666
PNA
334 continue;
335 }
336 if (tuplehash->tuple.dir)
337 continue;
338
339 flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
340
341 iter(flow, data);
342 }
ac2a6666
PNA
343 rhashtable_walk_stop(&hti);
344 rhashtable_walk_exit(&hti);
345
346 return err;
347}
ac2a6666 348
b9660987 349static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data)
ac2a6666 350{
b9660987 351 struct nf_flowtable *flow_table = data;
ac2a6666 352
b32d2f34 353 if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) ||
b9660987
TY
354 (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN)))
355 flow_offload_del(flow_table, flow);
b408c5b0
PNA
356}
357
a268de77 358static void nf_flow_offload_work_gc(struct work_struct *work)
b408c5b0
PNA
359{
360 struct nf_flowtable *flow_table;
361
362 flow_table = container_of(work, struct nf_flowtable, gc_work.work);
b9660987 363 nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
ac2a6666
PNA
364 queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
365}
ac2a6666
PNA
366
367static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
368 __be16 port, __be16 new_port)
369{
370 struct tcphdr *tcph;
371
372 if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
373 skb_try_make_writable(skb, thoff + sizeof(*tcph)))
374 return -1;
375
376 tcph = (void *)(skb_network_header(skb) + thoff);
377 inet_proto_csum_replace2(&tcph->check, skb, port, new_port, true);
378
379 return 0;
380}
381
382static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
383 __be16 port, __be16 new_port)
384{
385 struct udphdr *udph;
386
387 if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
388 skb_try_make_writable(skb, thoff + sizeof(*udph)))
389 return -1;
390
391 udph = (void *)(skb_network_header(skb) + thoff);
392 if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
393 inet_proto_csum_replace2(&udph->check, skb, port,
394 new_port, true);
395 if (!udph->check)
396 udph->check = CSUM_MANGLED_0;
397 }
398
399 return 0;
400}
401
402static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
403 u8 protocol, __be16 port, __be16 new_port)
404{
405 switch (protocol) {
406 case IPPROTO_TCP:
407 if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0)
408 return NF_DROP;
409 break;
410 case IPPROTO_UDP:
411 if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0)
412 return NF_DROP;
413 break;
414 }
415
416 return 0;
417}
418
419int nf_flow_snat_port(const struct flow_offload *flow,
420 struct sk_buff *skb, unsigned int thoff,
421 u8 protocol, enum flow_offload_tuple_dir dir)
422{
423 struct flow_ports *hdr;
424 __be16 port, new_port;
425
426 if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
427 skb_try_make_writable(skb, thoff + sizeof(*hdr)))
428 return -1;
429
430 hdr = (void *)(skb_network_header(skb) + thoff);
431
432 switch (dir) {
433 case FLOW_OFFLOAD_DIR_ORIGINAL:
434 port = hdr->source;
435 new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port;
436 hdr->source = new_port;
437 break;
438 case FLOW_OFFLOAD_DIR_REPLY:
439 port = hdr->dest;
440 new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
441 hdr->dest = new_port;
442 break;
443 default:
444 return -1;
445 }
446
447 return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
448}
449EXPORT_SYMBOL_GPL(nf_flow_snat_port);
450
451int nf_flow_dnat_port(const struct flow_offload *flow,
452 struct sk_buff *skb, unsigned int thoff,
453 u8 protocol, enum flow_offload_tuple_dir dir)
454{
455 struct flow_ports *hdr;
456 __be16 port, new_port;
457
458 if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
459 skb_try_make_writable(skb, thoff + sizeof(*hdr)))
460 return -1;
461
462 hdr = (void *)(skb_network_header(skb) + thoff);
463
464 switch (dir) {
465 case FLOW_OFFLOAD_DIR_ORIGINAL:
466 port = hdr->dest;
467 new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port;
468 hdr->dest = new_port;
469 break;
470 case FLOW_OFFLOAD_DIR_REPLY:
471 port = hdr->source;
472 new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
473 hdr->source = new_port;
474 break;
475 default:
476 return -1;
477 }
478
479 return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
480}
481EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
482
a268de77
FF
483int nf_flow_table_init(struct nf_flowtable *flowtable)
484{
485 int err;
486
487 INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
488
489 err = rhashtable_init(&flowtable->rhashtable,
490 &nf_flow_offload_rhash_params);
491 if (err < 0)
492 return err;
493
494 queue_delayed_work(system_power_efficient_wq,
495 &flowtable->gc_work, HZ);
496
84453a90
FF
497 mutex_lock(&flowtable_lock);
498 list_add(&flowtable->list, &flowtables);
499 mutex_unlock(&flowtable_lock);
500
a268de77
FF
501 return 0;
502}
503EXPORT_SYMBOL_GPL(nf_flow_table_init);
504
c0ea1bcb
PNA
505static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
506{
507 struct net_device *dev = data;
508
59c466dd
FF
509 if (!dev) {
510 flow_offload_teardown(flow);
c0ea1bcb 511 return;
59c466dd 512 }
b32d2f34
PNA
513
514 if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) &&
a3fb3698
TY
515 (flow->tuplehash[0].tuple.iifidx == dev->ifindex ||
516 flow->tuplehash[1].tuple.iifidx == dev->ifindex))
59c466dd 517 flow_offload_dead(flow);
c0ea1bcb
PNA
518}
519
520static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
84453a90 521 struct net_device *dev)
c0ea1bcb 522{
84453a90 523 nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
c0ea1bcb
PNA
524 flush_delayed_work(&flowtable->gc_work);
525}
526
5f1be84a 527void nf_flow_table_cleanup(struct net_device *dev)
c0ea1bcb 528{
84453a90
FF
529 struct nf_flowtable *flowtable;
530
531 mutex_lock(&flowtable_lock);
532 list_for_each_entry(flowtable, &flowtables, list)
533 nf_flow_table_iterate_cleanup(flowtable, dev);
534 mutex_unlock(&flowtable_lock);
c0ea1bcb
PNA
535}
536EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
537
b408c5b0
PNA
538void nf_flow_table_free(struct nf_flowtable *flow_table)
539{
84453a90
FF
540 mutex_lock(&flowtable_lock);
541 list_del(&flow_table->list);
542 mutex_unlock(&flowtable_lock);
a268de77 543 cancel_delayed_work_sync(&flow_table->gc_work);
b408c5b0 544 nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
b9660987 545 nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
a268de77 546 rhashtable_destroy(&flow_table->rhashtable);
b408c5b0
PNA
547}
548EXPORT_SYMBOL_GPL(nf_flow_table_free);
549
ac2a6666
PNA
550MODULE_LICENSE("GPL");
551MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");