/*
 * (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2011 Patrick McHardy <kaber@trash.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/skbuff.h>
#include <linux/gfp.h>
#include <linux/jhash.h>
#include <linux/rtnetlink.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <linux/netfilter/nf_nat.h>

#include "nf_internals.h"

static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];

static DEFINE_MUTEX(nf_nat_proto_mutex);
static unsigned int nat_net_id __read_mostly;

static struct hlist_head *nf_nat_bysource __read_mostly;
static unsigned int nf_nat_htable_size __read_mostly;
static unsigned int nf_nat_hash_rnd __read_mostly;

struct nf_nat_lookup_hook_priv {
	struct nf_hook_entries __rcu *entries;

	struct rcu_head rcu_head;
};

struct nf_nat_hooks_net {
	struct nf_hook_ops *nat_hook_ops;
	unsigned int users;
};

struct nat_net {
	struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO];
};

#ifdef CONFIG_XFRM
static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
				       const struct nf_conn *ct,
				       enum ip_conntrack_dir dir,
				       unsigned long statusbit,
				       struct flowi *fl)
{
	const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
	struct flowi4 *fl4 = &fl->u.ip4;

	if (ct->status & statusbit) {
		fl4->daddr = t->dst.u3.ip;
		if (t->dst.protonum == IPPROTO_TCP ||
		    t->dst.protonum == IPPROTO_UDP ||
		    t->dst.protonum == IPPROTO_UDPLITE ||
		    t->dst.protonum == IPPROTO_DCCP ||
		    t->dst.protonum == IPPROTO_SCTP)
			fl4->fl4_dport = t->dst.u.all;
	}

	statusbit ^= IPS_NAT_MASK;

	if (ct->status & statusbit) {
		fl4->saddr = t->src.u3.ip;
		if (t->dst.protonum == IPPROTO_TCP ||
		    t->dst.protonum == IPPROTO_UDP ||
		    t->dst.protonum == IPPROTO_UDPLITE ||
		    t->dst.protonum == IPPROTO_DCCP ||
		    t->dst.protonum == IPPROTO_SCTP)
			fl4->fl4_sport = t->src.u.all;
	}
}

static void nf_nat_ipv6_decode_session(struct sk_buff *skb,
				       const struct nf_conn *ct,
				       enum ip_conntrack_dir dir,
				       unsigned long statusbit,
				       struct flowi *fl)
{
#if IS_ENABLED(CONFIG_IPV6)
	const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
	struct flowi6 *fl6 = &fl->u.ip6;

	if (ct->status & statusbit) {
		fl6->daddr = t->dst.u3.in6;
		if (t->dst.protonum == IPPROTO_TCP ||
		    t->dst.protonum == IPPROTO_UDP ||
		    t->dst.protonum == IPPROTO_UDPLITE ||
		    t->dst.protonum == IPPROTO_DCCP ||
		    t->dst.protonum == IPPROTO_SCTP)
			fl6->fl6_dport = t->dst.u.all;
	}

	statusbit ^= IPS_NAT_MASK;

	if (ct->status & statusbit) {
		fl6->saddr = t->src.u3.in6;
		if (t->dst.protonum == IPPROTO_TCP ||
		    t->dst.protonum == IPPROTO_UDP ||
		    t->dst.protonum == IPPROTO_UDPLITE ||
		    t->dst.protonum == IPPROTO_DCCP ||
		    t->dst.protonum == IPPROTO_SCTP)
			fl6->fl6_sport = t->src.u.all;
	}
#endif
}
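
/* Copy the NAT-relevant addresses and ports of the packet's conntrack
 * tuple into the flow, honouring which NAT bits (SRC/DST) apply in the
 * packet's direction.  Wired up as the decode_session callback in
 * nat_hook below, so XFRM policy lookups can take NAT into account.
 */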
static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl)
{
	const struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	enum ip_conntrack_dir dir;
	unsigned long statusbit;
	u8 family;

	ct = nf_ct_get(skb, &ctinfo);
	if (ct == NULL)
		return;

	family = nf_ct_l3num(ct);
	dir = CTINFO2DIR(ctinfo);
	if (dir == IP_CT_DIR_ORIGINAL)
		statusbit = IPS_DST_NAT;
	else
		statusbit = IPS_SRC_NAT;

	switch (family) {
	case NFPROTO_IPV4:
		nf_nat_ipv4_decode_session(skb, ct, dir, statusbit, fl);
		return;
	case NFPROTO_IPV6:
		nf_nat_ipv6_decode_session(skb, ct, dir, statusbit, fl);
		return;
	}
}
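
/* Re-route a packet through XFRM after NAT has rewritten its addresses:
 * decode the flow, redo the policy lookup and attach the new route to
 * the skb, expanding headroom if the new output device needs more.
 */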
int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
{
	struct flowi fl;
	unsigned int hh_len;
	struct dst_entry *dst;
	struct sock *sk = skb->sk;
	int err;

	err = xfrm_decode_session(skb, &fl, family);
	if (err < 0)
		return err;

	dst = skb_dst(skb);
	if (dst->xfrm)
		dst = ((struct xfrm_dst *)dst)->route;
	if (!dst_hold_safe(dst))
		return -EHOSTUNREACH;

	if (sk && !net_eq(net, sock_net(sk)))
		sk = NULL;

	dst = xfrm_lookup(net, dst, &fl, sk, 0);
	if (IS_ERR(dst))
		return PTR_ERR(dst);

	skb_dst_drop(skb);
	skb_dst_set(skb, dst);

	/* Change in oif may mean change in hh_len. */
	hh_len = skb_dst(skb)->dev->hard_header_len;
	if (skb_headroom(skb) < hh_len &&
	    pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
		return -ENOMEM;
	return 0;
}
EXPORT_SYMBOL(nf_xfrm_me_harder);
#endif /* CONFIG_XFRM */

/* We keep an extra hash for each conntrack, for fast searching. */
static unsigned int
hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
{
	unsigned int hash;

	get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));

	/* Original src, to ensure we map it consistently if poss. */
	hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
		      tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));

	return reciprocal_scale(hash, nf_nat_htable_size);
}
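
/* reciprocal_scale() above maps the 32-bit hash into
 * [0, nf_nat_htable_size) using a multiply and shift rather than a
 * modulo, so any table size works without a division.
 */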

/* Is this tuple already taken? (not by us) */
static int
nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
		  const struct nf_conn *ignored_conntrack)
{
	/* Conntrack tracking doesn't keep track of outgoing tuples; only
	 * incoming ones.  NAT means they don't have a fixed mapping,
	 * so we invert the tuple and look for the incoming reply.
	 *
	 * We could keep a separate hash if this proves too slow.
	 */
	struct nf_conntrack_tuple reply;

	nf_ct_invert_tuple(&reply, tuple);
	return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
}

static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t,
				 const struct nf_nat_range2 *range)
{
	if (t->src.l3num == NFPROTO_IPV4)
		return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
		       ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);

	return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 &&
	       ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0;
}

/* Is the manipulable part of the tuple between min and max incl? */
static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
			     enum nf_nat_manip_type maniptype,
			     const union nf_conntrack_man_proto *min,
			     const union nf_conntrack_man_proto *max)
{
	__be16 port;

	switch (tuple->dst.protonum) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
		       ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
	case IPPROTO_GRE: /* all fall through */
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_UDPLITE:
	case IPPROTO_DCCP:
	case IPPROTO_SCTP:
		if (maniptype == NF_NAT_MANIP_SRC)
			port = tuple->src.u.all;
		else
			port = tuple->dst.u.all;

		return ntohs(port) >= ntohs(min->all) &&
		       ntohs(port) <= ntohs(max->all);
	default:
		return false;
	}
}

/* If we source map this tuple so reply looks like reply_tuple, will
 * that meet the constraints of range.
 */
static int in_range(const struct nf_conntrack_tuple *tuple,
		    const struct nf_nat_range2 *range)
{
	/* If we are supposed to map IPs, then we must be in the
	 * range specified, otherwise let this drag us onto a new src IP.
	 */
	if (range->flags & NF_NAT_RANGE_MAP_IPS &&
	    !nf_nat_inet_in_range(tuple, range))
		return 0;

	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
		return 1;

	return l4proto_in_range(tuple, NF_NAT_MANIP_SRC,
				&range->min_proto, &range->max_proto);
}

static inline int
same_src(const struct nf_conn *ct,
	 const struct nf_conntrack_tuple *tuple)
{
	const struct nf_conntrack_tuple *t;

	t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
	return (t->dst.protonum == tuple->dst.protonum &&
		nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) &&
		t->src.u.all == tuple->src.u.all);
}
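
/* SNAT mapping reuse: a host that already has a source mapping should
 * keep leaving through the same address, so we look its original
 * source up in the bysource hash (under RCU) before picking a new one.
 */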
/* Only called for SRC manip */
static int
find_appropriate_src(struct net *net,
		     const struct nf_conntrack_zone *zone,
		     const struct nf_conntrack_tuple *tuple,
		     struct nf_conntrack_tuple *result,
		     const struct nf_nat_range2 *range)
{
	unsigned int h = hash_by_src(net, tuple);
	const struct nf_conn *ct;

	hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
		if (same_src(ct, tuple) &&
		    net_eq(net, nf_ct_net(ct)) &&
		    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
			/* Copy source part from reply tuple. */
			nf_ct_invert_tuple(result,
					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
			result->dst = tuple->dst;

			if (in_range(result, range))
				return 1;
		}
	}
	return 0;
}

/* For [FUTURE] fragmentation handling, we want the least-used
 * src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
 * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
 * 1-65535, we don't do pro-rata allocation based on ports; we choose
 * the ip with the lowest src-ip/dst-ip/proto usage.
 */
static void
find_best_ips_proto(const struct nf_conntrack_zone *zone,
		    struct nf_conntrack_tuple *tuple,
		    const struct nf_nat_range2 *range,
		    const struct nf_conn *ct,
		    enum nf_nat_manip_type maniptype)
{
	union nf_inet_addr *var_ipp;
	unsigned int i, max;
	/* Host order */
	u32 minip, maxip, j, dist;
	bool full_range;

	/* No IP mapping?  Do nothing. */
	if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
		return;

	if (maniptype == NF_NAT_MANIP_SRC)
		var_ipp = &tuple->src.u3;
	else
		var_ipp = &tuple->dst.u3;

	/* Fast path: only one choice. */
	if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) {
		*var_ipp = range->min_addr;
		return;
	}

	if (nf_ct_l3num(ct) == NFPROTO_IPV4)
		max = sizeof(var_ipp->ip) / sizeof(u32) - 1;
	else
		max = sizeof(var_ipp->ip6) / sizeof(u32) - 1;

	/* Hashing source and destination IPs gives a fairly even
	 * spread in practice (if there are a small number of IPs
	 * involved, there usually aren't that many connections
	 * anyway).  The consistency means that servers see the same
	 * client coming from the same IP (some Internet Banking sites
	 * like this), even across reboots.
	 */
	j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
		   range->flags & NF_NAT_RANGE_PERSISTENT ?
			0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id);

	full_range = false;
	for (i = 0; i <= max; i++) {
		/* If first bytes of the address are at the maximum, use the
		 * distance. Otherwise use the full range.
		 */
		if (!full_range) {
			minip = ntohl((__force __be32)range->min_addr.all[i]);
			maxip = ntohl((__force __be32)range->max_addr.all[i]);
			dist  = maxip - minip + 1;
		} else {
			minip = 0;
			dist  = ~0;
		}

		var_ipp->all[i] = (__force __u32)
			htonl(minip + reciprocal_scale(j, dist));
		if (var_ipp->all[i] != range->max_addr.all[i])
			full_range = true;

		if (!(range->flags & NF_NAT_RANGE_PERSISTENT))
			j ^= (__force u32)tuple->dst.u3.all[i];
	}
}
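
/* Example for the selection above: with an SNAT range of
 * 10.0.0.1-10.0.0.14, jhash2() over the client's source address feeds
 * reciprocal_scale(), so a given client is consistently mapped to the
 * same address of the range (per destination, unless
 * NF_NAT_RANGE_PERSISTENT asks for the same mapping regardless of
 * destination).
 */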

/* Alter the per-proto part of the tuple (depending on maniptype), to
 * give a unique tuple in the given range if possible.
 *
 * Per-protocol part of tuple is initialized to the incoming packet.
 */
static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
					const struct nf_nat_range2 *range,
					enum nf_nat_manip_type maniptype,
					const struct nf_conn *ct)
{
	unsigned int range_size, min, max, i, attempts;
	__be16 *keyptr;
	u16 off;
	static const unsigned int max_attempts = 128;

	switch (tuple->dst.protonum) {
	case IPPROTO_ICMP: /* fallthrough */
	case IPPROTO_ICMPV6:
		/* id is same for either direction... */
		keyptr = &tuple->src.u.icmp.id;
		if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
			min = 0;
			range_size = 65536;
		} else {
			min = ntohs(range->min_proto.icmp.id);
			range_size = ntohs(range->max_proto.icmp.id) -
				     ntohs(range->min_proto.icmp.id) + 1;
		}
		goto find_free_id;
#if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE)
	case IPPROTO_GRE:
		/* If there is no master conntrack we are not PPTP,
		 * do not change tuples
		 */
		if (!ct->master)
			return;

		if (maniptype == NF_NAT_MANIP_SRC)
			keyptr = &tuple->src.u.gre.key;
		else
			keyptr = &tuple->dst.u.gre.key;

		if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
			min = 1;
			range_size = 65535;
		} else {
			min = ntohs(range->min_proto.gre.key);
			range_size = ntohs(range->max_proto.gre.key) - min + 1;
		}
		goto find_free_id;
#endif
	case IPPROTO_UDP:	/* fallthrough */
	case IPPROTO_UDPLITE:	/* fallthrough */
	case IPPROTO_TCP:	/* fallthrough */
	case IPPROTO_SCTP:	/* fallthrough */
	case IPPROTO_DCCP:	/* fallthrough */
		if (maniptype == NF_NAT_MANIP_SRC)
			keyptr = &tuple->src.u.all;
		else
			keyptr = &tuple->dst.u.all;

		break;
	default:
		return;
	}

	/* If no range specified... */
	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
		/* If it's dst rewrite, can't change port */
		if (maniptype == NF_NAT_MANIP_DST)
			return;

		if (ntohs(*keyptr) < 1024) {
			/* Loose convention: >> 512 is credential passing */
			if (ntohs(*keyptr) < 512) {
				min = 1;
				range_size = 511 - min + 1;
			} else {
				min = 600;
				range_size = 1023 - min + 1;
			}
		} else {
			min = 1024;
			range_size = 65535 - 1024 + 1;
		}
	} else {
		min = ntohs(range->min_proto.all);
		max = ntohs(range->max_proto.all);
		if (unlikely(max < min))
			swap(max, min);
		range_size = max - min + 1;
	}

find_free_id:
	if (range->flags & NF_NAT_RANGE_PROTO_OFFSET)
		off = (ntohs(*keyptr) - ntohs(range->base_proto.all));
	else
		off = prandom_u32();

	attempts = range_size;
	if (attempts > max_attempts)
		attempts = max_attempts;

	/* We are in softirq; doing a search of the entire range risks
	 * soft lockup when all tuples are already used.
	 *
	 * If we can't find any free port from first offset, pick a new
	 * one and try again, with ever smaller search window.
	 */
another_round:
	for (i = 0; i < attempts; i++, off++) {
		*keyptr = htons(min + off % range_size);
		if (!nf_nat_used_tuple(tuple, ct))
			return;
	}

	if (attempts >= range_size || attempts < 16)
		return;
	attempts /= 2;
	off = prandom_u32();
	goto another_round;
}

/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
 * we change the source to map into the range. For NF_INET_PRE_ROUTING
 * and NF_INET_LOCAL_OUT, we change the destination to map into the
 * range. It might not be possible to get a unique tuple, but we try.
 * At worst (or if we race), we will end up with a final duplicate in
 * __ip_conntrack_confirm and drop the packet. */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
		 const struct nf_conntrack_tuple *orig_tuple,
		 const struct nf_nat_range2 *range,
		 struct nf_conn *ct,
		 enum nf_nat_manip_type maniptype)
{
	const struct nf_conntrack_zone *zone;
	struct net *net = nf_ct_net(ct);

	zone = nf_ct_zone(ct);

	/* 1) If this srcip/proto/src-proto-part is currently mapped,
	 * and that same mapping gives a unique tuple within the given
	 * range, use that.
	 *
	 * This is only required for source (ie. NAT/masq) mappings.
	 * So far, we don't do local source mappings, so multiple
	 * manips not an issue.
	 */
	if (maniptype == NF_NAT_MANIP_SRC &&
	    !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
		/* try the original tuple first */
		if (in_range(orig_tuple, range)) {
			if (!nf_nat_used_tuple(orig_tuple, ct)) {
				*tuple = *orig_tuple;
				return;
			}
		} else if (find_appropriate_src(net, zone,
						orig_tuple, tuple, range)) {
			pr_debug("get_unique_tuple: Found current src map\n");
			if (!nf_nat_used_tuple(tuple, ct))
				return;
		}
	}

	/* 2) Select the least-used IP/proto combination in the given range */
	*tuple = *orig_tuple;
	find_best_ips_proto(zone, tuple, range, ct, maniptype);

	/* 3) The per-protocol part of the manip is made to map into
	 * the range to make a unique tuple.
	 */

	/* Only bother mapping if it's not already in range and unique */
	if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
		if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
			if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
			    l4proto_in_range(tuple, maniptype,
					     &range->min_proto,
					     &range->max_proto) &&
			    (range->min_proto.all == range->max_proto.all ||
			     !nf_nat_used_tuple(tuple, ct)))
				return;
		} else if (!nf_nat_used_tuple(tuple, ct)) {
			return;
		}
	}

	/* Last chance: get protocol to try to obtain unique tuple. */
	nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct);
}
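
/* Attach the NAT extension to a conntrack entry if not already there.
 * Extensions can only be added while the conntrack is unconfirmed, so
 * this may still return NULL for a confirmed entry without one.
 */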
struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct)
{
	struct nf_conn_nat *nat = nfct_nat(ct);

	if (nat)
		return nat;

	if (!nf_ct_is_confirmed(ct))
		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);

	return nat;
}
EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add);
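
/* A minimal sketch of how a NAT target typically drives
 * nf_nat_setup_info() below; "newsrc" is a placeholder address the
 * caller would have chosen, and port selection is left to the
 * per-protocol logic:
 *
 *	struct nf_nat_range2 range = {
 *		.flags		= NF_NAT_RANGE_MAP_IPS,
 *		.min_addr	= newsrc,
 *		.max_addr	= newsrc,
 *	};
 *	return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
 */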
unsigned int
nf_nat_setup_info(struct nf_conn *ct,
		  const struct nf_nat_range2 *range,
		  enum nf_nat_manip_type maniptype)
{
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_tuple curr_tuple, new_tuple;

	/* Can't setup nat info for confirmed ct. */
	if (nf_ct_is_confirmed(ct))
		return NF_ACCEPT;

	WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
		maniptype != NF_NAT_MANIP_DST);

	if (WARN_ON(nf_nat_initialized(ct, maniptype)))
		return NF_DROP;

	/* What we've got will look like inverse of reply. Normally
	 * this is what is in the conntrack, except for prior
	 * manipulations (future optimization: if num_manips == 0,
	 * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
	 */
	nf_ct_invert_tuple(&curr_tuple,
			   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);

	if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
		struct nf_conntrack_tuple reply;

		/* Alter conntrack table so will recognize replies. */
		nf_ct_invert_tuple(&reply, &new_tuple);
		nf_conntrack_alter_reply(ct, &reply);

		/* Non-atomic: we own this at the moment. */
		if (maniptype == NF_NAT_MANIP_SRC)
			ct->status |= IPS_SRC_NAT;
		else
			ct->status |= IPS_DST_NAT;

		if (nfct_help(ct) && !nfct_seqadj(ct))
			if (!nfct_seqadj_ext_add(ct))
				return NF_DROP;
	}

	if (maniptype == NF_NAT_MANIP_SRC) {
		unsigned int srchash;
		spinlock_t *lock;

		srchash = hash_by_src(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
		lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
		spin_lock_bh(lock);
		hlist_add_head_rcu(&ct->nat_bysource,
				   &nf_nat_bysource[srchash]);
		spin_unlock_bh(lock);
	}

	/* It's done. */
	if (maniptype == NF_NAT_MANIP_DST)
		ct->status |= IPS_DST_NAT_DONE;
	else
		ct->status |= IPS_SRC_NAT_DONE;

	return NF_ACCEPT;
}
EXPORT_SYMBOL(nf_nat_setup_info);

static unsigned int
__nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
{
	/* Force range to this IP; let proto decide mapping for
	 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
	 * Use reply in case it's already been mangled (eg local packet).
	 */
	union nf_inet_addr ip =
		(manip == NF_NAT_MANIP_SRC ?
		ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
		ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
	struct nf_nat_range2 range = {
		.flags		= NF_NAT_RANGE_MAP_IPS,
		.min_addr	= ip,
		.max_addr	= ip,
	};
	return nf_nat_setup_info(ct, &range, manip);
}

unsigned int
nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
{
	return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum));
}
EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);

/* Do packet manipulations according to nf_nat_setup_info. */
unsigned int nf_nat_packet(struct nf_conn *ct,
			   enum ip_conntrack_info ctinfo,
			   unsigned int hooknum,
			   struct sk_buff *skb)
{
	enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
	unsigned int verdict = NF_ACCEPT;
	unsigned long statusbit;

	if (mtype == NF_NAT_MANIP_SRC)
		statusbit = IPS_SRC_NAT;
	else
		statusbit = IPS_DST_NAT;

	/* Invert if this is reply dir. */
	if (dir == IP_CT_DIR_REPLY)
		statusbit ^= IPS_NAT_MASK;

	/* Non-atomic: these bits don't change. */
	if (ct->status & statusbit)
		verdict = nf_nat_manip_pkt(skb, ct, mtype, dir);

	return verdict;
}
EXPORT_SYMBOL_GPL(nf_nat_packet);
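
/* Common IPv4/IPv6 NAT hook body.  For the first packet of a
 * connection it runs the NAT lookup hooks registered for this hook
 * point (e.g. by iptables or nftables) to set up a mapping, falling
 * back to a null binding, then translates every packet of the
 * connection via nf_nat_packet().
 */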
unsigned int
nf_nat_inet_fn(void *priv, struct sk_buff *skb,
	       const struct nf_hook_state *state)
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	struct nf_conn_nat *nat;
	/* maniptype == SRC for postrouting. */
	enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);

	ct = nf_ct_get(skb, &ctinfo);
	/* Can't track?  It's not due to stress, or conntrack would
	 * have dropped it.  Hence it's the user's responsibility to
	 * packet filter it out, or implement conntrack/NAT for that
	 * protocol. 8) --RR
	 */
	if (!ct)
		return NF_ACCEPT;

	nat = nfct_nat(ct);

	switch (ctinfo) {
	case IP_CT_RELATED:
	case IP_CT_RELATED_REPLY:
		/* Only ICMPs can be IP_CT_IS_REPLY.  Fallthrough */
	case IP_CT_NEW:
		/* Seen it before?  This can happen for loopback, retrans,
		 * or local packets.
		 */
		if (!nf_nat_initialized(ct, maniptype)) {
			struct nf_nat_lookup_hook_priv *lpriv = priv;
			struct nf_hook_entries *e = rcu_dereference(lpriv->entries);
			unsigned int ret;
			int i;

			if (!e)
				goto null_bind;

			for (i = 0; i < e->num_hook_entries; i++) {
				ret = e->hooks[i].hook(e->hooks[i].priv, skb,
						       state);
				if (ret != NF_ACCEPT)
					return ret;
				if (nf_nat_initialized(ct, maniptype))
					goto do_nat;
			}
null_bind:
			ret = nf_nat_alloc_null_binding(ct, state->hook);
			if (ret != NF_ACCEPT)
				return ret;
		} else {
			pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n",
				 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
				 ct, ct->status);
			if (nf_nat_oif_changed(state->hook, ctinfo, nat,
					       state->out))
				goto oif_changed;
		}
		break;
	default:
		/* ESTABLISHED */
		WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
			ctinfo != IP_CT_ESTABLISHED_REPLY);
		if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
			goto oif_changed;
	}

do_nat:
	return nf_nat_packet(ct, ctinfo, state->hook, skb);

oif_changed:
	nf_ct_kill_acct(ct, ctinfo, skb);
	return NF_DROP;
}
EXPORT_SYMBOL_GPL(nf_nat_inet_fn);

struct nf_nat_proto_clean {
	u8	l3proto;
	u8	l4proto;
};

/* kill conntracks with affected NAT section */
static int nf_nat_proto_remove(struct nf_conn *i, void *data)
{
	const struct nf_nat_proto_clean *clean = data;

	if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) ||
	    (clean->l4proto && nf_ct_protonum(i) != clean->l4proto))
		return 0;

	return i->status & IPS_NAT_MASK ? 1 : 0;
}

static void __nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
	unsigned int h;

	h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
	hlist_del_rcu(&ct->nat_bysource);
	spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
}

static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
{
	if (nf_nat_proto_remove(ct, data))
		return 1;

	/* This module is being removed and conntrack has nat null binding.
	 * Remove it from bysource hash, as the table will be freed soon.
	 *
	 * Else, when the conntrack is destroyed, nf_nat_cleanup_conntrack()
	 * will delete entry from already-freed table.
	 */
	if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status))
		__nf_nat_cleanup_conntrack(ct);

	/* don't delete conntrack.  Although that would make things a lot
	 * simpler, we'd end up flushing all conntracks on nat rmmod.
	 */
	return 0;
}

/* No one using conntrack by the time this is called. */
static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
	if (ct->status & IPS_SRC_NAT_DONE)
		__nf_nat_cleanup_conntrack(ct);
}

static struct nf_ct_ext_type nat_extend __read_mostly = {
	.len		= sizeof(struct nf_conn_nat),
	.align		= __alignof__(struct nf_conn_nat),
	.destroy	= nf_nat_cleanup_conntrack,
	.id		= NF_CT_EXT_NAT,
};

#if IS_ENABLED(CONFIG_NF_CT_NETLINK)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>

static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
	[CTA_PROTONAT_PORT_MIN]	= { .type = NLA_U16 },
	[CTA_PROTONAT_PORT_MAX]	= { .type = NLA_U16 },
};

static int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[],
					  struct nf_nat_range2 *range)
{
	if (tb[CTA_PROTONAT_PORT_MIN]) {
		range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]);
		range->max_proto.all = range->min_proto.all;
		range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
	}
	if (tb[CTA_PROTONAT_PORT_MAX]) {
		range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]);
		range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
	}
	return 0;
}

static int nfnetlink_parse_nat_proto(struct nlattr *attr,
				     const struct nf_conn *ct,
				     struct nf_nat_range2 *range)
{
	struct nlattr *tb[CTA_PROTONAT_MAX+1];
	int err;

	err = nla_parse_nested_deprecated(tb, CTA_PROTONAT_MAX, attr,
					  protonat_nla_policy, NULL);
	if (err < 0)
		return err;

	return nf_nat_l4proto_nlattr_to_range(tb, range);
}

static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
	[CTA_NAT_V4_MINIP]	= { .type = NLA_U32 },
	[CTA_NAT_V4_MAXIP]	= { .type = NLA_U32 },
	[CTA_NAT_V6_MINIP]	= { .len = sizeof(struct in6_addr) },
	[CTA_NAT_V6_MAXIP]	= { .len = sizeof(struct in6_addr) },
	[CTA_NAT_PROTO]		= { .type = NLA_NESTED },
};
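
/* CTA_NAT is a nest: address bounds at the top level (IPv4 or IPv6,
 * with the maximum defaulting to the minimum when absent) plus an
 * optional CTA_NAT_PROTO nest carrying CTA_PROTONAT_PORT_MIN/MAX,
 * parsed by the helpers below.
 */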
static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
				       struct nf_nat_range2 *range)
{
	if (tb[CTA_NAT_V4_MINIP]) {
		range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]);
		range->flags |= NF_NAT_RANGE_MAP_IPS;
	}

	if (tb[CTA_NAT_V4_MAXIP])
		range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]);
	else
		range->max_addr.ip = range->min_addr.ip;

	return 0;
}

static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[],
				       struct nf_nat_range2 *range)
{
	if (tb[CTA_NAT_V6_MINIP]) {
		nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP],
			   sizeof(struct in6_addr));
		range->flags |= NF_NAT_RANGE_MAP_IPS;
	}

	if (tb[CTA_NAT_V6_MAXIP])
		nla_memcpy(&range->max_addr.ip6, tb[CTA_NAT_V6_MAXIP],
			   sizeof(struct in6_addr));
	else
		range->max_addr = range->min_addr;

	return 0;
}

static int
nfnetlink_parse_nat(const struct nlattr *nat,
		    const struct nf_conn *ct, struct nf_nat_range2 *range)
{
	struct nlattr *tb[CTA_NAT_MAX+1];
	int err;

	memset(range, 0, sizeof(*range));

	err = nla_parse_nested_deprecated(tb, CTA_NAT_MAX, nat,
					  nat_nla_policy, NULL);
	if (err < 0)
		return err;

	switch (nf_ct_l3num(ct)) {
	case NFPROTO_IPV4:
		err = nf_nat_ipv4_nlattr_to_range(tb, range);
		break;
	case NFPROTO_IPV6:
		err = nf_nat_ipv6_nlattr_to_range(tb, range);
		break;
	default:
		err = -EPROTONOSUPPORT;
		break;
	}

	if (err)
		return err;

	if (!tb[CTA_NAT_PROTO])
		return 0;

	return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range);
}

/* This function is called under rcu_read_lock() */
static int
nfnetlink_parse_nat_setup(struct nf_conn *ct,
			  enum nf_nat_manip_type manip,
			  const struct nlattr *attr)
{
	struct nf_nat_range2 range;
	int err;

	/* Should not happen, restricted to creating new conntracks
	 * via ctnetlink.
	 */
	if (WARN_ON_ONCE(nf_nat_initialized(ct, manip)))
		return -EEXIST;

	/* No NAT information has been passed, allocate the null-binding */
	if (attr == NULL)
		return __nf_nat_alloc_null_binding(ct, manip) == NF_DROP ? -ENOMEM : 0;

	err = nfnetlink_parse_nat(attr, ct, &range);
	if (err < 0)
		return err;

	return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0;
}
#else
static int
nfnetlink_parse_nat_setup(struct nf_conn *ct,
			  enum nf_nat_manip_type manip,
			  const struct nlattr *attr)
{
	return -EOPNOTSUPP;
}
#endif

static struct nf_ct_helper_expectfn follow_master_nat = {
	.name		= "nat-follow-master",
	.expectfn	= nf_nat_follow_master,
};
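
/* Hook registration model: the first nf_nat_register_fn() call for a
 * family installs the real netfilter hooks (a copy of orig_nat_ops);
 * later registrants are chained onto the per-hook nf_hook_entries in
 * the hook's private data.  "users" counts registrants so the hooks
 * are only torn down when the last one unregisters.
 */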
int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
		       const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count)
{
	struct nat_net *nat_net = net_generic(net, nat_net_id);
	struct nf_nat_hooks_net *nat_proto_net;
	struct nf_nat_lookup_hook_priv *priv;
	unsigned int hooknum = ops->hooknum;
	struct nf_hook_ops *nat_ops;
	int i, ret;

	if (WARN_ON_ONCE(pf >= ARRAY_SIZE(nat_net->nat_proto_net)))
		return -EINVAL;

	nat_proto_net = &nat_net->nat_proto_net[pf];

	for (i = 0; i < ops_count; i++) {
		if (orig_nat_ops[i].hooknum == hooknum) {
			hooknum = i;
			break;
		}
	}

	if (WARN_ON_ONCE(i == ops_count))
		return -EINVAL;

	mutex_lock(&nf_nat_proto_mutex);
	if (!nat_proto_net->nat_hook_ops) {
		WARN_ON(nat_proto_net->users != 0);

		nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL);
		if (!nat_ops) {
			mutex_unlock(&nf_nat_proto_mutex);
			return -ENOMEM;
		}

		for (i = 0; i < ops_count; i++) {
			priv = kzalloc(sizeof(*priv), GFP_KERNEL);
			if (priv) {
				nat_ops[i].priv = priv;
				continue;
			}
			mutex_unlock(&nf_nat_proto_mutex);
			while (i)
				kfree(nat_ops[--i].priv);
			kfree(nat_ops);
			return -ENOMEM;
		}

		ret = nf_register_net_hooks(net, nat_ops, ops_count);
		if (ret < 0) {
			mutex_unlock(&nf_nat_proto_mutex);
			for (i = 0; i < ops_count; i++)
				kfree(nat_ops[i].priv);
			kfree(nat_ops);
			return ret;
		}

		nat_proto_net->nat_hook_ops = nat_ops;
	}

	nat_ops = nat_proto_net->nat_hook_ops;
	priv = nat_ops[hooknum].priv;
	if (WARN_ON_ONCE(!priv)) {
		mutex_unlock(&nf_nat_proto_mutex);
		return -EOPNOTSUPP;
	}

	ret = nf_hook_entries_insert_raw(&priv->entries, ops);
	if (ret == 0)
		nat_proto_net->users++;

	mutex_unlock(&nf_nat_proto_mutex);
	return ret;
}

void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
			  unsigned int ops_count)
{
	struct nat_net *nat_net = net_generic(net, nat_net_id);
	struct nf_nat_hooks_net *nat_proto_net;
	struct nf_nat_lookup_hook_priv *priv;
	struct nf_hook_ops *nat_ops;
	int hooknum = ops->hooknum;
	int i;

	if (pf >= ARRAY_SIZE(nat_net->nat_proto_net))
		return;

	nat_proto_net = &nat_net->nat_proto_net[pf];

	mutex_lock(&nf_nat_proto_mutex);
	if (WARN_ON(nat_proto_net->users == 0))
		goto unlock;

	nat_proto_net->users--;

	nat_ops = nat_proto_net->nat_hook_ops;
	for (i = 0; i < ops_count; i++) {
		if (nat_ops[i].hooknum == hooknum) {
			hooknum = i;
			break;
		}
	}

	if (WARN_ON_ONCE(i == ops_count))
		goto unlock;

	priv = nat_ops[hooknum].priv;
	nf_hook_entries_delete_raw(&priv->entries, ops);

	if (nat_proto_net->users == 0) {
		nf_unregister_net_hooks(net, nat_ops, ops_count);

		for (i = 0; i < ops_count; i++) {
			priv = nat_ops[i].priv;
			kfree_rcu(priv, rcu_head);
		}

		nat_proto_net->nat_hook_ops = NULL;
		kfree(nat_ops);
	}
unlock:
	mutex_unlock(&nf_nat_proto_mutex);
}

static struct pernet_operations nat_net_ops = {
	.id = &nat_net_id,
	.size = sizeof(struct nat_net),
};

static struct nf_nat_hook nat_hook = {
	.parse_nat_setup	= nfnetlink_parse_nat_setup,
#ifdef CONFIG_XFRM
	.decode_session		= __nf_nat_decode_session,
#endif
	.manip_pkt		= nf_nat_manip_pkt,
};

static int __init nf_nat_init(void)
{
	int ret, i;

	/* Leave them the same for the moment. */
	nf_nat_htable_size = nf_conntrack_htable_size;
	if (nf_nat_htable_size < CONNTRACK_LOCKS)
		nf_nat_htable_size = CONNTRACK_LOCKS;

	nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
	if (!nf_nat_bysource)
		return -ENOMEM;

	ret = nf_ct_extend_register(&nat_extend);
	if (ret < 0) {
		kvfree(nf_nat_bysource);
		pr_err("Unable to register extension\n");
		return ret;
	}

	for (i = 0; i < CONNTRACK_LOCKS; i++)
		spin_lock_init(&nf_nat_locks[i]);

	ret = register_pernet_subsys(&nat_net_ops);
	if (ret < 0) {
		nf_ct_extend_unregister(&nat_extend);
		kvfree(nf_nat_bysource);
		return ret;
	}

	nf_ct_helper_expectfn_register(&follow_master_nat);

	WARN_ON(nf_nat_hook != NULL);
	RCU_INIT_POINTER(nf_nat_hook, &nat_hook);

	return 0;
}

static void __exit nf_nat_cleanup(void)
{
	struct nf_nat_proto_clean clean = {};

	nf_ct_iterate_destroy(nf_nat_proto_clean, &clean);

	nf_ct_extend_unregister(&nat_extend);
	nf_ct_helper_expectfn_unregister(&follow_master_nat);
	RCU_INIT_POINTER(nf_nat_hook, NULL);

	synchronize_net();
	kvfree(nf_nat_bysource);
	unregister_pernet_subsys(&nat_net_ops);
}

MODULE_LICENSE("GPL");

module_init(nf_nat_init);
module_exit(nf_nat_cleanup);