// SPDX-License-Identifier: GPL-2.0-only
/*
 * (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2011 Patrick McHardy <kaber@trash.net>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/skbuff.h>
#include <linux/gfp.h>
#include <net/xfrm.h>
#include <linux/jhash.h>
#include <linux/rtnetlink.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
#include <uapi/linux/netfilter/nf_nat.h>

#include "nf_internals.h"
static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];

static DEFINE_MUTEX(nf_nat_proto_mutex);
static unsigned int nat_net_id __read_mostly;

static struct hlist_head *nf_nat_bysource __read_mostly;
static unsigned int nf_nat_htable_size __read_mostly;
static unsigned int nf_nat_hash_rnd __read_mostly;

struct nf_nat_lookup_hook_priv {
	struct nf_hook_entries __rcu *entries;

	struct rcu_head rcu_head;
};

struct nf_nat_hooks_net {
	struct nf_hook_ops *nat_hook_ops;
	unsigned int users;
};

struct nat_net {
	struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO];
};
#ifdef CONFIG_XFRM
static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
				       const struct nf_conn *ct,
				       enum ip_conntrack_dir dir,
				       unsigned long statusbit,
				       struct flowi *fl)
{
	const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
	struct flowi4 *fl4 = &fl->u.ip4;

	if (ct->status & statusbit) {
		fl4->daddr = t->dst.u3.ip;
		if (t->dst.protonum == IPPROTO_TCP ||
		    t->dst.protonum == IPPROTO_UDP ||
		    t->dst.protonum == IPPROTO_UDPLITE ||
		    t->dst.protonum == IPPROTO_DCCP ||
		    t->dst.protonum == IPPROTO_SCTP)
			fl4->fl4_dport = t->dst.u.all;
	}

	statusbit ^= IPS_NAT_MASK;

	if (ct->status & statusbit) {
		fl4->saddr = t->src.u3.ip;
		if (t->dst.protonum == IPPROTO_TCP ||
		    t->dst.protonum == IPPROTO_UDP ||
		    t->dst.protonum == IPPROTO_UDPLITE ||
		    t->dst.protonum == IPPROTO_DCCP ||
		    t->dst.protonum == IPPROTO_SCTP)
			fl4->fl4_sport = t->src.u.all;
	}
}
static void nf_nat_ipv6_decode_session(struct sk_buff *skb,
				       const struct nf_conn *ct,
				       enum ip_conntrack_dir dir,
				       unsigned long statusbit,
				       struct flowi *fl)
{
#if IS_ENABLED(CONFIG_IPV6)
	const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
	struct flowi6 *fl6 = &fl->u.ip6;

	if (ct->status & statusbit) {
		fl6->daddr = t->dst.u3.in6;
		if (t->dst.protonum == IPPROTO_TCP ||
		    t->dst.protonum == IPPROTO_UDP ||
		    t->dst.protonum == IPPROTO_UDPLITE ||
		    t->dst.protonum == IPPROTO_DCCP ||
		    t->dst.protonum == IPPROTO_SCTP)
			fl6->fl6_dport = t->dst.u.all;
	}

	statusbit ^= IPS_NAT_MASK;

	if (ct->status & statusbit) {
		fl6->saddr = t->src.u3.in6;
		if (t->dst.protonum == IPPROTO_TCP ||
		    t->dst.protonum == IPPROTO_UDP ||
		    t->dst.protonum == IPPROTO_UDPLITE ||
		    t->dst.protonum == IPPROTO_DCCP ||
		    t->dst.protonum == IPPROTO_SCTP)
			fl6->fl6_sport = t->src.u.all;
	}
#endif
}
static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl)
{
	const struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	enum ip_conntrack_dir dir;
	unsigned long statusbit;
	u8 family;

	ct = nf_ct_get(skb, &ctinfo);
	if (ct == NULL)
		return;

	family = nf_ct_l3num(ct);
	dir = CTINFO2DIR(ctinfo);
	if (dir == IP_CT_DIR_ORIGINAL)
		statusbit = IPS_DST_NAT;
	else
		statusbit = IPS_SRC_NAT;

	switch (family) {
	case NFPROTO_IPV4:
		nf_nat_ipv4_decode_session(skb, ct, dir, statusbit, fl);
		return;
	case NFPROTO_IPV6:
		nf_nat_ipv6_decode_session(skb, ct, dir, statusbit, fl);
		return;
	}
}
int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
{
	struct flowi fl;
	unsigned int hh_len;
	struct dst_entry *dst;
	struct sock *sk = skb->sk;
	int err;

	err = xfrm_decode_session(skb, &fl, family);
	if (err < 0)
		return err;

	dst = skb_dst(skb);
	if (dst->xfrm)
		dst = ((struct xfrm_dst *)dst)->route;
	if (!dst_hold_safe(dst))
		return -EHOSTUNREACH;

	if (sk && !net_eq(net, sock_net(sk)))
		sk = NULL;

	dst = xfrm_lookup(net, dst, &fl, sk, 0);
	if (IS_ERR(dst))
		return PTR_ERR(dst);

	skb_dst_drop(skb);
	skb_dst_set(skb, dst);

	/* Change in oif may mean change in hh_len. */
	hh_len = skb_dst(skb)->dev->hard_header_len;
	if (skb_headroom(skb) < hh_len &&
	    pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
		return -ENOMEM;
	return 0;
}
EXPORT_SYMBOL(nf_xfrm_me_harder);
#endif /* CONFIG_XFRM */
/* We keep an extra hash for each conntrack, for fast searching. */
static unsigned int
hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
{
	unsigned int hash;

	get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));

	/* Original src, to ensure we map it consistently if poss. */
	hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
		      tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));

	return reciprocal_scale(hash, nf_nat_htable_size);
}
/* Is this tuple already taken? (not by us) */
static int
nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
		  const struct nf_conn *ignored_conntrack)
{
	/* Conntrack tracking doesn't keep track of outgoing tuples; only
	 * incoming ones.  NAT means they don't have a fixed mapping,
	 * so we invert the tuple and look for the incoming reply.
	 *
	 * We could keep a separate hash if this proves too slow.
	 */
	struct nf_conntrack_tuple reply;

	nf_ct_invert_tuple(&reply, tuple);
	return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
}
static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t,
				 const struct nf_nat_range2 *range)
{
	if (t->src.l3num == NFPROTO_IPV4)
		return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
		       ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);

	return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 &&
	       ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0;
}
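/*
 * Annotation (not part of the original source): the ntohl() conversions
 * matter on little-endian hosts, where raw __be32 values do not compare
 * in address order; e.g. 10.0.0.2 (0x0a000002) must sort below 10.0.1.1
 * (0x0a000101), which only holds after byte-swapping to host order.
 */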
/* Is the manipable part of the tuple between min and max incl? */
static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
			     enum nf_nat_manip_type maniptype,
			     const union nf_conntrack_man_proto *min,
			     const union nf_conntrack_man_proto *max)
{
	__be16 port;

	switch (tuple->dst.protonum) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
		       ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
	case IPPROTO_GRE: /* all fall through */
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_UDPLITE:
	case IPPROTO_DCCP:
	case IPPROTO_SCTP:
		if (maniptype == NF_NAT_MANIP_SRC)
			port = tuple->src.u.all;
		else
			port = tuple->dst.u.all;

		return ntohs(port) >= ntohs(min->all) &&
		       ntohs(port) <= ntohs(max->all);
	default:
		return true;
	}
}
/* If we source map this tuple so reply looks like reply_tuple, will
 * that meet the constraints of range.
 */
static int in_range(const struct nf_conntrack_tuple *tuple,
		    const struct nf_nat_range2 *range)
{
	/* If we are supposed to map IPs, then we must be in the
	 * range specified, otherwise let this drag us onto a new src IP.
	 */
	if (range->flags & NF_NAT_RANGE_MAP_IPS &&
	    !nf_nat_inet_in_range(tuple, range))
		return 0;

	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
		return 1;

	return l4proto_in_range(tuple, NF_NAT_MANIP_SRC,
				&range->min_proto, &range->max_proto);
}
static inline int
same_src(const struct nf_conn *ct,
	 const struct nf_conntrack_tuple *tuple)
{
	const struct nf_conntrack_tuple *t;

	t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
	return (t->dst.protonum == tuple->dst.protonum &&
		nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) &&
		t->src.u.all == tuple->src.u.all);
}
/* Only called for SRC manip */
static int
find_appropriate_src(struct net *net,
		     const struct nf_conntrack_zone *zone,
		     const struct nf_conntrack_tuple *tuple,
		     struct nf_conntrack_tuple *result,
		     const struct nf_nat_range2 *range)
{
	unsigned int h = hash_by_src(net, tuple);
	const struct nf_conn *ct;

	hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
		if (same_src(ct, tuple) &&
		    net_eq(net, nf_ct_net(ct)) &&
		    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
			/* Copy source part from reply tuple. */
			nf_ct_invert_tuple(result,
				       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
			result->dst = tuple->dst;

			if (in_range(result, range))
				return 1;
		}
	}
	return 0;
}
/* For [FUTURE] fragmentation handling, we want the least-used
 * src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
 * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
 * 1-65535, we don't do pro-rata allocation based on ports; we choose
 * the ip with the lowest src-ip/dst-ip/proto usage.
 */
static void
find_best_ips_proto(const struct nf_conntrack_zone *zone,
		    struct nf_conntrack_tuple *tuple,
		    const struct nf_nat_range2 *range,
		    const struct nf_conn *ct,
		    enum nf_nat_manip_type maniptype)
{
	union nf_inet_addr *var_ipp;
	unsigned int i, max;
	/* Host order */
	u32 minip, maxip, j, dist;
	bool full_range;

	/* No IP mapping?  Do nothing. */
	if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
		return;

	if (maniptype == NF_NAT_MANIP_SRC)
		var_ipp = &tuple->src.u3;
	else
		var_ipp = &tuple->dst.u3;

	/* Fast path: only one choice. */
	if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) {
		*var_ipp = range->min_addr;
		return;
	}

	if (nf_ct_l3num(ct) == NFPROTO_IPV4)
		max = sizeof(var_ipp->ip) / sizeof(u32) - 1;
	else
		max = sizeof(var_ipp->ip6) / sizeof(u32) - 1;

	/* Hashing source and destination IPs gives a fairly even
	 * spread in practice (if there are a small number of IPs
	 * involved, there usually aren't that many connections
	 * anyway).  The consistency means that servers see the same
	 * client coming from the same IP (some Internet Banking sites
	 * like this), even across reboots.
	 */
	j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
		   range->flags & NF_NAT_RANGE_PERSISTENT ?
			0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id);

	full_range = false;
	for (i = 0; i <= max; i++) {
		/* If first bytes of the address are at the maximum, use the
		 * distance. Otherwise use the full range.
		 */
		if (!full_range) {
			minip = ntohl((__force __be32)range->min_addr.all[i]);
			maxip = ntohl((__force __be32)range->max_addr.all[i]);
			dist  = maxip - minip + 1;
		} else {
			minip = 0;
			dist  = ~0;
		}

		var_ipp->all[i] = (__force __u32)
			htonl(minip + reciprocal_scale(j, dist));
		if (var_ipp->all[i] != range->max_addr.all[i])
			full_range = true;

		if (!(range->flags & NF_NAT_RANGE_PERSISTENT))
			j ^= (__force u32)tuple->dst.u3.all[i];
	}
}
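/*
 * Worked example (annotation only): with NF_NAT_RANGE_MAP_IPS and the
 * IPv4 range 192.0.2.10 - 192.0.2.19, minip/maxip for the single address
 * word give dist = 10, and the chosen address is
 *	htonl(minip + reciprocal_scale(j, 10)),
 * i.e. one of the ten candidates, selected consistently by the source-IP
 * hash j so the same client keeps the same mapped address.
 */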
/* Alter the per-proto part of the tuple (depending on maniptype), to
 * give a unique tuple in the given range if possible.
 *
 * Per-protocol part of tuple is initialized to the incoming packet.
 */
static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
					const struct nf_nat_range2 *range,
					enum nf_nat_manip_type maniptype,
					const struct nf_conn *ct)
{
	unsigned int range_size, min, max, i, attempts;
	__be16 *keyptr;
	u16 off;
	static const unsigned int max_attempts = 128;

	switch (tuple->dst.protonum) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		/* id is same for either direction... */
		keyptr = &tuple->src.u.icmp.id;
		if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
			min = 0;
			range_size = 65536;
		} else {
			min = ntohs(range->min_proto.icmp.id);
			range_size = ntohs(range->max_proto.icmp.id) -
				     ntohs(range->min_proto.icmp.id) + 1;
		}
		goto find_free_id;
#if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE)
	case IPPROTO_GRE:
		/* If there is no master conntrack we are not PPTP,
		   do not change tuples */
		if (!ct->master)
			return;

		if (maniptype == NF_NAT_MANIP_SRC)
			keyptr = &tuple->src.u.gre.key;
		else
			keyptr = &tuple->dst.u.gre.key;

		if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
			min = 1;
			range_size = 65535;
		} else {
			min = ntohs(range->min_proto.gre.key);
			range_size = ntohs(range->max_proto.gre.key) - min + 1;
		}
		goto find_free_id;
#endif
	case IPPROTO_UDP:
	case IPPROTO_UDPLITE:
	case IPPROTO_TCP:
	case IPPROTO_SCTP:
	case IPPROTO_DCCP:
		if (maniptype == NF_NAT_MANIP_SRC)
			keyptr = &tuple->src.u.all;
		else
			keyptr = &tuple->dst.u.all;

		break;
	default:
		return;
	}

	/* If no range specified... */
	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
		/* If it's dst rewrite, can't change port */
		if (maniptype == NF_NAT_MANIP_DST)
			return;

		if (ntohs(*keyptr) < 1024) {
			/* Loose convention: >> 512 is credential passing */
			if (ntohs(*keyptr) < 512) {
				min = 1;
				range_size = 511 - min + 1;
			} else {
				min = 600;
				range_size = 1023 - min + 1;
			}
		} else {
			min = 1024;
			range_size = 65535 - 1024 + 1;
		}
	} else {
		min = ntohs(range->min_proto.all);
		max = ntohs(range->max_proto.all);
		if (unlikely(max < min))
			swap(max, min);
		range_size = max - min + 1;
	}

find_free_id:
	if (range->flags & NF_NAT_RANGE_PROTO_OFFSET)
		off = (ntohs(*keyptr) - ntohs(range->base_proto.all));
	else
		off = prandom_u32();

	attempts = range_size;
	if (attempts > max_attempts)
		attempts = max_attempts;

	/* We are in softirq; doing a search of the entire range risks
	 * soft lockup when all tuples are already used.
	 *
	 * If we can't find any free port from first offset, pick a new
	 * one and try again, with ever smaller search window.
	 */
another_round:
	for (i = 0; i < attempts; i++, off++) {
		*keyptr = htons(min + off % range_size);
		if (!nf_nat_used_tuple(tuple, ct))
			return;
	}

	if (attempts >= range_size || attempts < 16)
		return;
	attempts /= 2;
	off = prandom_u32();
	goto another_round;
}
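/*
 * Annotation (not part of the original source): without a specified proto
 * range, SNAT keeps source ports in the conventional bands:
 *	orig port <  512  -> try 1..511
 *	orig port < 1024  -> try 600..1023
 *	otherwise         -> try 1024..65535
 * e.g. a packet from port 777 gets min = 600, range_size = 424, and the
 * loop above walks htons(600 + off % 424) until nf_nat_used_tuple() finds
 * a free tuple or the attempt budget runs out.
 */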
/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
 * we change the source to map into the range. For NF_INET_PRE_ROUTING
 * and NF_INET_LOCAL_OUT, we change the destination to map into the
 * range. It might not be possible to get a unique tuple, but we try.
 * At worst (or if we race), we will end up with a final duplicate in
 * __nf_conntrack_confirm and drop the packet. */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
		 const struct nf_conntrack_tuple *orig_tuple,
		 const struct nf_nat_range2 *range,
		 struct nf_conn *ct,
		 enum nf_nat_manip_type maniptype)
{
	const struct nf_conntrack_zone *zone;
	struct net *net = nf_ct_net(ct);

	zone = nf_ct_zone(ct);

	/* 1) If this srcip/proto/src-proto-part is currently mapped,
	 * and that same mapping gives a unique tuple within the given
	 * range, use that.
	 *
	 * This is only required for source (ie. NAT/masq) mappings.
	 * So far, we don't do local source mappings, so multiple
	 * manips not an issue.
	 */
	if (maniptype == NF_NAT_MANIP_SRC &&
	    !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
		/* try the original tuple first */
		if (in_range(orig_tuple, range)) {
			if (!nf_nat_used_tuple(orig_tuple, ct)) {
				*tuple = *orig_tuple;
				return;
			}
		} else if (find_appropriate_src(net, zone,
						orig_tuple, tuple, range)) {
			pr_debug("get_unique_tuple: Found current src map\n");
			if (!nf_nat_used_tuple(tuple, ct))
				return;
		}
	}

	/* 2) Select the least-used IP/proto combination in the given range */
	*tuple = *orig_tuple;
	find_best_ips_proto(zone, tuple, range, ct, maniptype);

	/* 3) The per-protocol part of the manip is made to map into
	 * the range to make a unique tuple.
	 */

	/* Only bother mapping if it's not already in range and unique */
	if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
		if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
			if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
			    l4proto_in_range(tuple, maniptype,
					     &range->min_proto,
					     &range->max_proto) &&
			    (range->min_proto.all == range->max_proto.all ||
			     !nf_nat_used_tuple(tuple, ct)))
				return;
		} else if (!nf_nat_used_tuple(tuple, ct)) {
			return;
		}
	}

	/* Last chance: get protocol to try to obtain unique tuple. */
	nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct);
}
struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct)
{
	struct nf_conn_nat *nat = nfct_nat(ct);

	if (nat)
		return nat;

	if (!nf_ct_is_confirmed(ct))
		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);

	return nat;
}
EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add);
unsigned int
nf_nat_setup_info(struct nf_conn *ct,
		  const struct nf_nat_range2 *range,
		  enum nf_nat_manip_type maniptype)
{
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_tuple curr_tuple, new_tuple;

	/* Can't setup nat info for confirmed ct. */
	if (nf_ct_is_confirmed(ct))
		return NF_ACCEPT;

	WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
		maniptype != NF_NAT_MANIP_DST);

	if (WARN_ON(nf_nat_initialized(ct, maniptype)))
		return NF_DROP;

	/* What we've got will look like inverse of reply. Normally
	 * this is what is in the conntrack, except for prior
	 * manipulations (future optimization: if num_manips == 0,
	 * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
	 */
	nf_ct_invert_tuple(&curr_tuple,
			   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);

	if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
		struct nf_conntrack_tuple reply;

		/* Alter conntrack table so will recognize replies. */
		nf_ct_invert_tuple(&reply, &new_tuple);
		nf_conntrack_alter_reply(ct, &reply);

		/* Non-atomic: we own this at the moment. */
		if (maniptype == NF_NAT_MANIP_SRC)
			ct->status |= IPS_SRC_NAT;
		else
			ct->status |= IPS_DST_NAT;

		if (nfct_help(ct) && !nfct_seqadj(ct))
			if (!nfct_seqadj_ext_add(ct))
				return NF_DROP;
	}

	if (maniptype == NF_NAT_MANIP_SRC) {
		unsigned int srchash;
		spinlock_t *lock;

		srchash = hash_by_src(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
		lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
		spin_lock_bh(lock);
		hlist_add_head_rcu(&ct->nat_bysource,
				   &nf_nat_bysource[srchash]);
		spin_unlock_bh(lock);
	}

	/* It's done. */
	if (maniptype == NF_NAT_MANIP_DST)
		ct->status |= IPS_DST_NAT_DONE;
	else
		ct->status |= IPS_SRC_NAT_DONE;

	return NF_ACCEPT;
}
EXPORT_SYMBOL(nf_nat_setup_info);
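/*
 * Usage sketch (annotation only; the address is an example): an SNAT-style
 * caller maps the source to one address and lets the core pick the port:
 *
 *	struct nf_nat_range2 range = {
 *		.flags		= NF_NAT_RANGE_MAP_IPS,
 *		.min_addr.ip	= htonl(0xc0000201),	// 192.0.2.1
 *		.max_addr.ip	= htonl(0xc0000201),
 *	};
 *	return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
 *
 * This must run before the conntrack is confirmed (see the check above).
 */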
static unsigned int
__nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
{
	/* Force range to this IP; let proto decide mapping for
	 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
	 * Use reply in case it's already been mangled (eg local packet).
	 */
	union nf_inet_addr ip =
		(manip == NF_NAT_MANIP_SRC ?
		ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
		ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
	struct nf_nat_range2 range = {
		.flags		= NF_NAT_RANGE_MAP_IPS,
		.min_addr	= ip,
		.max_addr	= ip,
	};
	return nf_nat_setup_info(ct, &range, manip);
}

unsigned int
nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
{
	return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum));
}
EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);
/* Do packet manipulations according to nf_nat_setup_info. */
unsigned int nf_nat_packet(struct nf_conn *ct,
			   enum ip_conntrack_info ctinfo,
			   unsigned int hooknum,
			   struct sk_buff *skb)
{
	enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
	unsigned int verdict = NF_ACCEPT;
	unsigned long statusbit;

	if (mtype == NF_NAT_MANIP_SRC)
		statusbit = IPS_SRC_NAT;
	else
		statusbit = IPS_DST_NAT;

	/* Invert if this is reply dir. */
	if (dir == IP_CT_DIR_REPLY)
		statusbit ^= IPS_NAT_MASK;

	/* Non-atomic: these bits don't change. */
	if (ct->status & statusbit)
		verdict = nf_nat_manip_pkt(skb, ct, mtype, dir);

	return verdict;
}
EXPORT_SYMBOL_GPL(nf_nat_packet);
unsigned int
nf_nat_inet_fn(void *priv, struct sk_buff *skb,
	       const struct nf_hook_state *state)
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	struct nf_conn_nat *nat;
	/* maniptype == SRC for postrouting. */
	enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);

	ct = nf_ct_get(skb, &ctinfo);
	/* Can't track?  It's not due to stress, or conntrack would
	 * have dropped it.  Hence it's the user's responsibility to
	 * packet filter it out, or implement conntrack/NAT for that
	 * protocol. 8) --RR
	 */
	if (!ct)
		return NF_ACCEPT;

	nat = nfct_nat(ct);

	switch (ctinfo) {
	case IP_CT_RELATED:
	case IP_CT_RELATED_REPLY:
		/* Only ICMPs can be IP_CT_IS_REPLY.  Fallthrough */
	case IP_CT_NEW:
		/* Seen it before?  This can happen for loopback, retrans,
		 * or local packets.
		 */
		if (!nf_nat_initialized(ct, maniptype)) {
			struct nf_nat_lookup_hook_priv *lpriv = priv;
			struct nf_hook_entries *e = rcu_dereference(lpriv->entries);
			unsigned int ret;
			int i;

			if (!e)
				goto null_bind;

			for (i = 0; i < e->num_hook_entries; i++) {
				ret = e->hooks[i].hook(e->hooks[i].priv, skb,
						       state);
				if (ret != NF_ACCEPT)
					return ret;
				if (nf_nat_initialized(ct, maniptype))
					goto do_nat;
			}
null_bind:
			ret = nf_nat_alloc_null_binding(ct, state->hook);
			if (ret != NF_ACCEPT)
				return ret;
		} else {
			pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n",
				 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
				 ct, ct->status);
			if (nf_nat_oif_changed(state->hook, ctinfo, nat,
					       state->out))
				goto oif_changed;
		}
		break;
	default:
		/* ESTABLISHED */
		WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
			ctinfo != IP_CT_ESTABLISHED_REPLY);
		if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
			goto oif_changed;
	}

do_nat:
	return nf_nat_packet(ct, ctinfo, state->hook, skb);

oif_changed:
	nf_ct_kill_acct(ct, ctinfo, skb);
	return NF_DROP;
}
EXPORT_SYMBOL_GPL(nf_nat_inet_fn);
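/*
 * Annotation (not part of the original source): nf_nat_inet_fn() is the
 * shared per-family entry point. For a NEW connection, the loop above runs
 * each NAT hook registered through lpriv->entries until one of them sets
 * up a binding, and falls back to nf_nat_alloc_null_binding() when none
 * does, so every NATed conntrack always ends up with some binding.
 */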
struct nf_nat_proto_clean {
	u8	l3proto;
	u8	l4proto;
};
/* kill conntracks with affected NAT section */
static int nf_nat_proto_remove(struct nf_conn *i, void *data)
{
	const struct nf_nat_proto_clean *clean = data;

	if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) ||
	    (clean->l4proto && nf_ct_protonum(i) != clean->l4proto))
		return 0;

	return i->status & IPS_NAT_MASK ? 1 : 0;
}
static void __nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
	unsigned int h;

	h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
	hlist_del_rcu(&ct->nat_bysource);
	spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
}
static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
{
	if (nf_nat_proto_remove(ct, data))
		return 1;

	/* This module is being removed and conntrack has nat null binding.
	 * Remove it from bysource hash, as the table will be freed soon.
	 *
	 * Else, when the conntrack is destroyed, nf_nat_cleanup_conntrack()
	 * will delete entry from already-freed table.
	 */
	if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status))
		__nf_nat_cleanup_conntrack(ct);

	/* don't delete conntrack.  Although that would make things a lot
	 * simpler, we'd end up flushing all conntracks on nat rmmod.
	 */
	return 0;
}
845 static void nf_nat_cleanup_conntrack(struct nf_conn
*ct
)
847 if (ct
->status
& IPS_SRC_NAT_DONE
)
848 __nf_nat_cleanup_conntrack(ct
);
static struct nf_ct_ext_type nat_extend __read_mostly = {
	.len		= sizeof(struct nf_conn_nat),
	.align		= __alignof__(struct nf_conn_nat),
	.destroy	= nf_nat_cleanup_conntrack,
	.id		= NF_CT_EXT_NAT,
};
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>

static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
	[CTA_PROTONAT_PORT_MIN]	= { .type = NLA_U16 },
	[CTA_PROTONAT_PORT_MAX]	= { .type = NLA_U16 },
};
static int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[],
					  struct nf_nat_range2 *range)
{
	if (tb[CTA_PROTONAT_PORT_MIN]) {
		range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]);
		range->max_proto.all = range->min_proto.all;
		range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
	}
	if (tb[CTA_PROTONAT_PORT_MAX]) {
		range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]);
		range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
	}
	return 0;
}
static int nfnetlink_parse_nat_proto(struct nlattr *attr,
				     const struct nf_conn *ct,
				     struct nf_nat_range2 *range)
{
	struct nlattr *tb[CTA_PROTONAT_MAX+1];
	int err;

	err = nla_parse_nested_deprecated(tb, CTA_PROTONAT_MAX, attr,
					  protonat_nla_policy, NULL);
	if (err < 0)
		return err;

	return nf_nat_l4proto_nlattr_to_range(tb, range);
}
static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
	[CTA_NAT_V4_MINIP]	= { .type = NLA_U32 },
	[CTA_NAT_V4_MAXIP]	= { .type = NLA_U32 },
	[CTA_NAT_V6_MINIP]	= { .len = sizeof(struct in6_addr) },
	[CTA_NAT_V6_MAXIP]	= { .len = sizeof(struct in6_addr) },
	[CTA_NAT_PROTO]		= { .type = NLA_NESTED },
};
static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
				       struct nf_nat_range2 *range)
{
	if (tb[CTA_NAT_V4_MINIP]) {
		range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]);
		range->flags |= NF_NAT_RANGE_MAP_IPS;
	}

	if (tb[CTA_NAT_V4_MAXIP])
		range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]);
	else
		range->max_addr.ip = range->min_addr.ip;

	return 0;
}
static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[],
				       struct nf_nat_range2 *range)
{
	if (tb[CTA_NAT_V6_MINIP]) {
		nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP],
			   sizeof(struct in6_addr));
		range->flags |= NF_NAT_RANGE_MAP_IPS;
	}

	if (tb[CTA_NAT_V6_MAXIP])
		nla_memcpy(&range->max_addr.ip6, tb[CTA_NAT_V6_MAXIP],
			   sizeof(struct in6_addr));
	else
		range->max_addr = range->min_addr;

	return 0;
}
static int
nfnetlink_parse_nat(const struct nlattr *nat,
		    const struct nf_conn *ct, struct nf_nat_range2 *range)
{
	struct nlattr *tb[CTA_NAT_MAX+1];
	int err;

	memset(range, 0, sizeof(*range));

	err = nla_parse_nested_deprecated(tb, CTA_NAT_MAX, nat,
					  nat_nla_policy, NULL);
	if (err < 0)
		return err;

	switch (nf_ct_l3num(ct)) {
	case NFPROTO_IPV4:
		err = nf_nat_ipv4_nlattr_to_range(tb, range);
		break;
	case NFPROTO_IPV6:
		err = nf_nat_ipv6_nlattr_to_range(tb, range);
		break;
	default:
		err = -EPROTONOSUPPORT;
		break;
	}

	if (err)
		return err;

	if (!tb[CTA_NAT_PROTO])
		return 0;

	return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range);
}
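/*
 * Annotation (not part of the original source): the attribute layout
 * parsed above corresponds to a nested CTA_NAT blob such as
 *
 *	CTA_NAT_V4_MINIP = 192.0.2.10
 *	CTA_NAT_V4_MAXIP = 192.0.2.19
 *	CTA_NAT_PROTO
 *		CTA_PROTONAT_PORT_MIN = 1024
 *		CTA_PROTONAT_PORT_MAX = 2048
 *
 * which yields a range with NF_NAT_RANGE_MAP_IPS and
 * NF_NAT_RANGE_PROTO_SPECIFIED set.
 */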
/* This function is called under rcu_read_lock() */
static int
nfnetlink_parse_nat_setup(struct nf_conn *ct,
			  enum nf_nat_manip_type manip,
			  const struct nlattr *attr)
{
	struct nf_nat_range2 range;
	int err;

	/* Should not happen, restricted to creating new conntracks
	 * via ctnetlink.
	 */
	if (WARN_ON_ONCE(nf_nat_initialized(ct, manip)))
		return -EEXIST;

	/* No NAT information has been passed, allocate the null-binding */
	if (attr == NULL)
		return __nf_nat_alloc_null_binding(ct, manip) == NF_DROP ? -ENOMEM : 0;

	err = nfnetlink_parse_nat(attr, ct, &range);
	if (err < 0)
		return err;

	return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0;
}
#else
static int
nfnetlink_parse_nat_setup(struct nf_conn *ct,
			  enum nf_nat_manip_type manip,
			  const struct nlattr *attr)
{
	return -EOPNOTSUPP;
}
#endif
static struct nf_ct_helper_expectfn follow_master_nat = {
	.name		= "nat-follow-master",
	.expectfn	= nf_nat_follow_master,
};
int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
		       const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count)
{
	struct nat_net *nat_net = net_generic(net, nat_net_id);
	struct nf_nat_hooks_net *nat_proto_net;
	struct nf_nat_lookup_hook_priv *priv;
	unsigned int hooknum = ops->hooknum;
	struct nf_hook_ops *nat_ops;
	int i, ret;

	if (WARN_ON_ONCE(pf >= ARRAY_SIZE(nat_net->nat_proto_net)))
		return -EINVAL;

	nat_proto_net = &nat_net->nat_proto_net[pf];

	for (i = 0; i < ops_count; i++) {
		if (orig_nat_ops[i].hooknum == hooknum) {
			hooknum = i;
			break;
		}
	}

	if (WARN_ON_ONCE(i == ops_count))
		return -EINVAL;

	mutex_lock(&nf_nat_proto_mutex);
	if (!nat_proto_net->nat_hook_ops) {
		WARN_ON(nat_proto_net->users != 0);

		nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL);
		if (!nat_ops) {
			mutex_unlock(&nf_nat_proto_mutex);
			return -ENOMEM;
		}

		for (i = 0; i < ops_count; i++) {
			priv = kzalloc(sizeof(*priv), GFP_KERNEL);
			if (priv) {
				nat_ops[i].priv = priv;
				continue;
			}
			mutex_unlock(&nf_nat_proto_mutex);
			while (i)
				kfree(nat_ops[--i].priv);
			kfree(nat_ops);
			return -ENOMEM;
		}

		ret = nf_register_net_hooks(net, nat_ops, ops_count);
		if (ret < 0) {
			mutex_unlock(&nf_nat_proto_mutex);
			for (i = 0; i < ops_count; i++)
				kfree(nat_ops[i].priv);
			kfree(nat_ops);
			return ret;
		}

		nat_proto_net->nat_hook_ops = nat_ops;
	}

	nat_ops = nat_proto_net->nat_hook_ops;
	priv = nat_ops[hooknum].priv;
	if (WARN_ON_ONCE(!priv)) {
		mutex_unlock(&nf_nat_proto_mutex);
		return -EOPNOTSUPP;
	}

	ret = nf_hook_entries_insert_raw(&priv->entries, ops);
	if (ret == 0)
		nat_proto_net->users++;

	mutex_unlock(&nf_nat_proto_mutex);
	return ret;
}
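/*
 * Annotation (not part of the original source): callers pass their own ops
 * plus the family's template orig_nat_ops array. The first registration
 * for a family duplicates the templates and registers them with
 * nf_register_net_hooks(); later callers only splice their ops into the
 * per-hook entries list, refcounted via nat_proto_net->users.
 */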
void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
			  unsigned int ops_count)
{
	struct nat_net *nat_net = net_generic(net, nat_net_id);
	struct nf_nat_hooks_net *nat_proto_net;
	struct nf_nat_lookup_hook_priv *priv;
	struct nf_hook_ops *nat_ops;
	int hooknum = ops->hooknum;
	int i;

	if (pf >= ARRAY_SIZE(nat_net->nat_proto_net))
		return;

	nat_proto_net = &nat_net->nat_proto_net[pf];

	mutex_lock(&nf_nat_proto_mutex);
	if (WARN_ON(nat_proto_net->users == 0))
		goto unlock;

	nat_proto_net->users--;

	nat_ops = nat_proto_net->nat_hook_ops;
	for (i = 0; i < ops_count; i++) {
		if (nat_ops[i].hooknum == hooknum) {
			hooknum = i;
			break;
		}
	}
	if (WARN_ON_ONCE(i == ops_count))
		goto unlock;
	priv = nat_ops[hooknum].priv;
	nf_hook_entries_delete_raw(&priv->entries, ops);

	if (nat_proto_net->users == 0) {
		nf_unregister_net_hooks(net, nat_ops, ops_count);

		for (i = 0; i < ops_count; i++) {
			priv = nat_ops[i].priv;
			kfree_rcu(priv, rcu_head);
		}

		nat_proto_net->nat_hook_ops = NULL;
		kfree(nat_ops);
	}
unlock:
	mutex_unlock(&nf_nat_proto_mutex);
}
static struct pernet_operations nat_net_ops = {
	.id = &nat_net_id,
	.size = sizeof(struct nat_net),
};
static struct nf_nat_hook nat_hook = {
	.parse_nat_setup	= nfnetlink_parse_nat_setup,
#ifdef CONFIG_XFRM
	.decode_session		= __nf_nat_decode_session,
#endif
	.manip_pkt		= nf_nat_manip_pkt,
};
static int __init nf_nat_init(void)
{
	int ret, i;

	/* Leave them the same for the moment. */
	nf_nat_htable_size = nf_conntrack_htable_size;
	if (nf_nat_htable_size < CONNTRACK_LOCKS)
		nf_nat_htable_size = CONNTRACK_LOCKS;

	nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
	if (!nf_nat_bysource)
		return -ENOMEM;

	ret = nf_ct_extend_register(&nat_extend);
	if (ret < 0) {
		kvfree(nf_nat_bysource);
		pr_err("Unable to register extension\n");
		return ret;
	}

	for (i = 0; i < CONNTRACK_LOCKS; i++)
		spin_lock_init(&nf_nat_locks[i]);

	ret = register_pernet_subsys(&nat_net_ops);
	if (ret < 0) {
		nf_ct_extend_unregister(&nat_extend);
		kvfree(nf_nat_bysource);
		return ret;
	}

	nf_ct_helper_expectfn_register(&follow_master_nat);

	WARN_ON(nf_nat_hook != NULL);
	RCU_INIT_POINTER(nf_nat_hook, &nat_hook);

	return 0;
}
static void __exit nf_nat_cleanup(void)
{
	struct nf_nat_proto_clean clean = {};

	nf_ct_iterate_destroy(nf_nat_proto_clean, &clean);

	nf_ct_extend_unregister(&nat_extend);
	nf_ct_helper_expectfn_unregister(&follow_master_nat);
	RCU_INIT_POINTER(nf_nat_hook, NULL);

	synchronize_net();
	kvfree(nf_nat_bysource);
	unregister_pernet_subsys(&nat_net_ops);
}
MODULE_LICENSE("GPL");

module_init(nf_nat_init);
module_exit(nf_nat_cleanup);