/* NAT for netfilter; shared with compatibility layer. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/skbuff.h>
#include <linux/netfilter_ipv4.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/tcp.h>  /* For tcp_prot in getorigdst */
#include <linux/icmp.h>
#include <linux/udp.h>
#include <linux/jhash.h>

#define ASSERT_READ_LOCK(x)
#define ASSERT_WRITE_LOCK(x)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_nat.h>
#include <linux/netfilter_ipv4/ip_nat_protocol.h>
#include <linux/netfilter_ipv4/ip_nat_core.h>
#include <linux/netfilter_ipv4/ip_nat_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/listhelp.h>

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif

DEFINE_RWLOCK(ip_nat_lock);

/* Calculated at init based on memory size */
static unsigned int ip_nat_htable_size;

static struct list_head *bysource;
struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];


/* We keep an extra hash for each conntrack, for fast searching. */
static inline unsigned int
hash_by_src(const struct ip_conntrack_tuple *tuple)
{
	/* Original src, to ensure we map it consistently if poss. */
	return jhash_3words(tuple->src.ip, tuple->src.u.all,
			    tuple->dst.protonum, 0) % ip_nat_htable_size;
}

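/*
 * Editor's illustration (not in the original source): for an outgoing TCP
 * connection, the source hash is derived from the original tuple's source
 * address, source port and protocol number, roughly
 *
 *	h = jhash_3words(saddr, sport, IPPROTO_TCP, 0) % ip_nat_htable_size;
 *
 * so all conntracks sharing the same source address/port/protocol land in
 * the same bysource[] bucket, which is what find_appropriate_src() walks.
 */
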
/* No one is using the conntrack by the time this is called. */
static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
{
	if (!(conn->status & IPS_NAT_DONE_MASK))
		return;

	write_lock_bh(&ip_nat_lock);
	list_del(&conn->nat.info.bysource);
	write_unlock_bh(&ip_nat_lock);
}

/* We do checksum mangling, so if they were wrong before they're still
 * wrong.  Also works for incomplete packets (eg. ICMP dest
 * unreachables.) */
u_int16_t
ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
{
	u_int32_t diffs[] = { oldvalinv, newval };
	return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
				      oldcheck^0xFFFF));
}

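/*
 * Editor's note (illustration, not from the original file): this is the
 * usual incremental checksum update (cf. RFC 1624).  Rather than
 * recomputing a whole header checksum after rewriting a 32-bit field,
 * callers pass the one's complement of the old value, the new value and
 * the old checksum, e.g. the way manip_pkt() below fixes the IP header
 * checksum after rewriting the source address:
 *
 *	iph->check = ip_nat_cheat_check(~iph->saddr, target->src.ip,
 *					iph->check);
 *	iph->saddr = target->src.ip;
 */
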
/* Is this tuple already taken? (not by us) */
int
ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
		  const struct ip_conntrack *ignored_conntrack)
{
	/* Conntrack tracking doesn't keep track of outgoing tuples; only
	   incoming ones.  NAT means they don't have a fixed mapping,
	   so we invert the tuple and look for the incoming reply.

	   We could keep a separate hash if this proves too slow. */
	struct ip_conntrack_tuple reply;

	invert_tuplepr(&reply, tuple);
	return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
}

/* If we source map this tuple so the reply looks like reply_tuple, will
 * that meet the constraints of the range? */
static int
in_range(const struct ip_conntrack_tuple *tuple,
	 const struct ip_nat_range *range)
{
	struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum);

	/* If we are supposed to map IPs, then we must be in the
	   range specified, otherwise let this drag us onto a new src IP. */
	if (range->flags & IP_NAT_RANGE_MAP_IPS) {
		if (ntohl(tuple->src.ip) < ntohl(range->min_ip)
		    || ntohl(tuple->src.ip) > ntohl(range->max_ip))
			return 0;
	}

	if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
	    || proto->in_range(tuple, IP_NAT_MANIP_SRC,
			       &range->min, &range->max))
		return 1;

	return 0;
}

static inline int
same_src(const struct ip_conntrack *ct,
	 const struct ip_conntrack_tuple *tuple)
{
	return (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
		== tuple->dst.protonum
		&& ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
		== tuple->src.ip
		&& ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
		== tuple->src.u.all);
}

/* Only called for SRC manip */
static int
find_appropriate_src(const struct ip_conntrack_tuple *tuple,
		     struct ip_conntrack_tuple *result,
		     const struct ip_nat_range *range)
{
	unsigned int h = hash_by_src(tuple);
	struct ip_conntrack *ct;

	read_lock_bh(&ip_nat_lock);
	list_for_each_entry(ct, &bysource[h], nat.info.bysource) {
		if (same_src(ct, tuple)) {
			/* Copy source part from reply tuple. */
			invert_tuplepr(result,
				       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
			result->dst = tuple->dst;

			if (in_range(result, range)) {
				read_unlock_bh(&ip_nat_lock);
				return 1;
			}
		}
	}
	read_unlock_bh(&ip_nat_lock);
	return 0;
}

/* For [FUTURE] fragmentation handling, we want the least-used
   src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
   if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
   1-65535, we don't do pro-rata allocation based on ports; we choose
   the ip with the lowest src-ip/dst-ip/proto usage.
*/
static void
find_best_ips_proto(struct ip_conntrack_tuple *tuple,
		    const struct ip_nat_range *range,
		    const struct ip_conntrack *conntrack,
		    enum ip_nat_manip_type maniptype)
{
	u_int32_t *var_ipp;
	/* Host order */
	u_int32_t minip, maxip, j;

	/* No IP mapping?  Do nothing. */
	if (!(range->flags & IP_NAT_RANGE_MAP_IPS))
		return;

	if (maniptype == IP_NAT_MANIP_SRC)
		var_ipp = &tuple->src.ip;
	else
		var_ipp = &tuple->dst.ip;

	/* Fast path: only one choice. */
	if (range->min_ip == range->max_ip) {
		*var_ipp = range->min_ip;
		return;
	}

	/* Hashing source and destination IPs gives a fairly even
	 * spread in practice (if there are a small number of IPs
	 * involved, there usually aren't that many connections
	 * anyway).  The consistency means that servers see the same
	 * client coming from the same IP (some Internet Banking sites
	 * like this), even across reboots. */
	minip = ntohl(range->min_ip);
	maxip = ntohl(range->max_ip);
	j = jhash_2words(tuple->src.ip, tuple->dst.ip, 0);
	*var_ipp = htonl(minip + j % (maxip - minip + 1));
}

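/*
 * Worked example (editor's illustration, not from the original source):
 * with an SNAT range of 10.0.0.1-10.0.0.4, maxip - minip + 1 is 4, so the
 * chosen source address becomes
 *
 *	htonl(minip + jhash_2words(saddr, daddr, 0) % 4)
 *
 * i.e. a given client/server address pair always maps to the same address
 * in the range, while different pairs spread roughly evenly across it.
 */
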
/* Manipulate the tuple into the range given.  For NF_IP_POST_ROUTING,
 * we change the source to map into the range.  For NF_IP_PRE_ROUTING
 * and NF_IP_LOCAL_OUT, we change the destination to map into the
 * range.  It might not be possible to get a unique tuple, but we try.
 * At worst (or if we race), we will end up with a final duplicate in
 * __ip_conntrack_confirm and drop the packet. */
static void
get_unique_tuple(struct ip_conntrack_tuple *tuple,
		 const struct ip_conntrack_tuple *orig_tuple,
		 const struct ip_nat_range *range,
		 struct ip_conntrack *conntrack,
		 enum ip_nat_manip_type maniptype)
{
	struct ip_nat_protocol *proto
		= ip_nat_find_proto(orig_tuple->dst.protonum);

	/* 1) If this srcip/proto/src-proto-part is currently mapped,
	   and that same mapping gives a unique tuple within the given
	   range, use that.

	   This is only required for source (ie. NAT/masq) mappings.
	   So far, we don't do local source mappings, so multiple
	   manips are not an issue. */
	if (maniptype == IP_NAT_MANIP_SRC) {
		if (find_appropriate_src(orig_tuple, tuple, range)) {
			DEBUGP("get_unique_tuple: Found current src map\n");
			if (!ip_nat_used_tuple(tuple, conntrack))
				return;
		}
	}

	/* 2) Select the least-used IP/proto combination in the given
	   range. */
	*tuple = *orig_tuple;
	find_best_ips_proto(tuple, range, conntrack, maniptype);

	/* 3) The per-protocol part of the manip is made to map into
	   the range to make a unique tuple. */

	/* Only bother mapping if it's not already in range and unique */
	if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
	     || proto->in_range(tuple, maniptype, &range->min, &range->max))
	    && !ip_nat_used_tuple(tuple, conntrack))
		return;

	/* Last chance: get protocol to try to obtain unique tuple. */
	proto->unique_tuple(tuple, range, maniptype, conntrack);
}

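/*
 * Editor's note: the per-protocol unique_tuple() hook used in step 3 above
 * is implemented in ip_nat_proto_{tcp,udp,icmp}.c.  For TCP/UDP it
 * effectively walks the allowed port range and picks the first port for
 * which ip_nat_used_tuple() is false; for ICMP the same is done with the
 * query id.  (Summary of behaviour elsewhere in the tree, not code from
 * this file.)
 */
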
unsigned int
ip_nat_setup_info(struct ip_conntrack *conntrack,
		  const struct ip_nat_range *range,
		  unsigned int hooknum)
{
	struct ip_conntrack_tuple curr_tuple, new_tuple;
	struct ip_nat_info *info = &conntrack->nat.info;
	int have_to_hash = !(conntrack->status & IPS_NAT_DONE_MASK);
	enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum);

	IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
		     || hooknum == NF_IP_POST_ROUTING
		     || hooknum == NF_IP_LOCAL_IN
		     || hooknum == NF_IP_LOCAL_OUT);
	BUG_ON(ip_nat_initialized(conntrack, maniptype));

	/* What we've got will look like the inverse of the reply.  Normally
	   this is what is in the conntrack, except for prior
	   manipulations (future optimization: if num_manips == 0,
	   orig_tp =
	   conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
	invert_tuplepr(&curr_tuple,
		       &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);

	get_unique_tuple(&new_tuple, &curr_tuple, range, conntrack, maniptype);

	if (!ip_ct_tuple_equal(&new_tuple, &curr_tuple)) {
		struct ip_conntrack_tuple reply;

		/* Alter conntrack table so it will recognize replies. */
		invert_tuplepr(&reply, &new_tuple);
		ip_conntrack_alter_reply(conntrack, &reply);

		/* Non-atomic: we own this at the moment. */
		if (maniptype == IP_NAT_MANIP_SRC)
			conntrack->status |= IPS_SRC_NAT;
		else
			conntrack->status |= IPS_DST_NAT;
	}

	/* Place in source hash if this is the first time. */
	if (have_to_hash) {
		unsigned int srchash
			= hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
				      .tuple);
		write_lock_bh(&ip_nat_lock);
		list_add(&info->bysource, &bysource[srchash]);
		write_unlock_bh(&ip_nat_lock);
	}

	/* It's done. */
	if (maniptype == IP_NAT_MANIP_DST)
		set_bit(IPS_DST_NAT_DONE_BIT, &conntrack->status);
	else
		set_bit(IPS_SRC_NAT_DONE_BIT, &conntrack->status);

	return NF_ACCEPT;
}

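/*
 * Usage sketch (editor's illustration; the real callers live in
 * ip_nat_rule.c and ip_nat_standalone.c, not in this file): an SNAT-style
 * target fills in an ip_nat_range and hands it to ip_nat_setup_info()
 * once per manip type, e.g. roughly:
 *
 *	struct ip_nat_range range = {
 *		.flags  = IP_NAT_RANGE_MAP_IPS,
 *		.min_ip = new_src_ip,	// hypothetical value
 *		.max_ip = new_src_ip,
 *	};
 *	ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
 *
 * After that, nat_packet() below performs the actual rewrites on every
 * packet of the connection.
 */
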
/* Returns true if succeeded. */
static int
manip_pkt(u_int16_t proto,
	  struct sk_buff **pskb,
	  unsigned int iphdroff,
	  const struct ip_conntrack_tuple *target,
	  enum ip_nat_manip_type maniptype)
{
	struct iphdr *iph;

	if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph)))
		return 0;

	iph = (void *)(*pskb)->data + iphdroff;

	/* Manipulate protocol part. */
	if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff,
						 target, maniptype))
		return 0;

	/* Reload iph: the protocol manip may have changed the skb data. */
	iph = (void *)(*pskb)->data + iphdroff;

	if (maniptype == IP_NAT_MANIP_SRC) {
		iph->check = ip_nat_cheat_check(~iph->saddr, target->src.ip,
						iph->check);
		iph->saddr = target->src.ip;
	} else {
		iph->check = ip_nat_cheat_check(~iph->daddr, target->dst.ip,
						iph->check);
		iph->daddr = target->dst.ip;
	}
	return 1;
}

/* Do packet manipulations according to ip_nat_setup_info. */
unsigned int nat_packet(struct ip_conntrack *ct,
			enum ip_conntrack_info ctinfo,
			unsigned int hooknum,
			struct sk_buff **pskb)
{
	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
	unsigned long statusbit;
	enum ip_nat_manip_type mtype = HOOK2MANIP(hooknum);

	if (mtype == IP_NAT_MANIP_SRC)
		statusbit = IPS_SRC_NAT;
	else
		statusbit = IPS_DST_NAT;

	/* Invert if this is reply dir. */
	if (dir == IP_CT_DIR_REPLY)
		statusbit ^= IPS_NAT_MASK;

	/* Non-atomic: these bits don't change. */
	if (ct->status & statusbit) {
		struct ip_conntrack_tuple target;

		/* We are aiming to look like inverse of other direction. */
		invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);

		if (!manip_pkt(target.dst.protonum, pskb, 0, &target, mtype))
			return NF_DROP;
	}
	return NF_ACCEPT;
}

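/*
 * Editor's note (illustration, not original): for a connection with
 * IPS_SRC_NAT set, original-direction packets are rewritten at
 * NF_IP_POST_ROUTING (SRC manip).  A reply packet travels the other way
 * and arrives with mtype == IP_NAT_MANIP_DST; XOR-ing statusbit with
 * IPS_NAT_MASK above turns IPS_DST_NAT into IPS_SRC_NAT, so that DST
 * manip matches the stored bit and the reply's destination is mapped back
 * to the internal address.
 */
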
/* Dir is the direction the ICMP is coming from (opposite to the packet
   it contains). */
int icmp_reply_translation(struct sk_buff **pskb,
			   struct ip_conntrack *ct,
			   enum ip_nat_manip_type manip,
			   enum ip_conntrack_dir dir)
{
	struct {
		struct icmphdr icmp;
		struct iphdr ip;
	} *inside;
	struct ip_conntrack_tuple inner, target;
	int hdrlen = (*pskb)->nh.iph->ihl * 4;

	if (!skb_ip_make_writable(pskb, hdrlen + sizeof(*inside)))
		return 0;

	inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;

	/* We're actually going to mangle it beyond trivial checksum
	   adjustment, so make sure the current checksum is correct. */
	if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) {
		hdrlen = (*pskb)->nh.iph->ihl * 4;
		if ((u16)csum_fold(skb_checksum(*pskb, hdrlen,
						(*pskb)->len - hdrlen, 0)))
			return 0;
	}

	/* Must be RELATED */
	IP_NF_ASSERT((*pskb)->nfctinfo == IP_CT_RELATED ||
		     (*pskb)->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY);

	/* Redirects on non-null nats must be dropped, else they'll
	   start talking to each other without our translation, and be
	   confused... --RR */
	if (inside->icmp.type == ICMP_REDIRECT) {
		/* If NAT isn't finished, assume it will be NATted and drop. */
		if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
			return 0;

		if (ct->status & IPS_NAT_MASK)
			return 0;
	}

	DEBUGP("icmp_reply_translation: translating error %p manp %u dir %s\n",
	       *pskb, manip, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");

	if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 +
			     sizeof(struct icmphdr) + inside->ip.ihl*4,
			     &inner, ip_ct_find_proto(inside->ip.protocol)))
		return 0;

	/* Change inner back to look like incoming packet.  We do the
	   opposite manip on this hook to normal, because it might not
	   pass all hooks (locally-generated ICMP).  Consider incoming
	   packet: PREROUTING (DST manip), routing produces ICMP, goes
	   through POSTROUTING (which must correct the DST manip). */
	if (!manip_pkt(inside->ip.protocol, pskb,
		       (*pskb)->nh.iph->ihl*4
		       + sizeof(inside->icmp),
		       &ct->tuplehash[!dir].tuple,
		       !manip))
		return 0;

	/* Reload "inside" since manip_pkt may have changed the skb data. */
	inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
	inside->icmp.checksum = 0;
	inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
						       (*pskb)->len - hdrlen,
						       0));

	/* Change outer to look like the reply to an incoming packet
	 * (proto 0 means don't invert per-proto part). */

	/* Obviously, we need to NAT the destination IP, but the source IP
	   should be NAT'ed only if it is from a NAT'd host.

	   Explanation: some people use NAT for anonymizing.  Also,
	   CERT recommends dropping all packets from private IP
	   addresses (although ICMP errors from internal links with
	   such addresses are not too uncommon, as Alan Cox points
	   out) */
	if (manip != IP_NAT_MANIP_SRC
	    || ((*pskb)->nh.iph->saddr == ct->tuplehash[dir].tuple.src.ip)) {
		invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
		if (!manip_pkt(0, pskb, 0, &target, manip))
			return 0;
	}

	return 1;
}

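/*
 * Editor's note (worked scenario, not from the original file): suppose an
 * internal host is SNAT'ed and an external router returns an ICMP
 * port-unreachable.  The embedded ("inside") header still carries the
 * mapped source address, so the inner header is rewritten with the
 * opposite manip first, the ICMP checksum is recomputed over the mangled
 * payload, and then the outer header is rewritten so the error is routed
 * back to the internal host as if no NAT had happened.
 */
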
/* Protocol registration. */
int ip_nat_protocol_register(struct ip_nat_protocol *proto)
{
	int ret = 0;

	write_lock_bh(&ip_nat_lock);
	if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) {
		ret = -EBUSY;
		goto out;
	}
	ip_nat_protos[proto->protonum] = proto;
 out:
	write_unlock_bh(&ip_nat_lock);
	return ret;
}

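/*
 * Registration sketch (editor's illustration; all names below are
 * hypothetical): a NAT protocol module fills in an ip_nat_protocol with
 * its protocol number and the manip_pkt/in_range/unique_tuple hooks used
 * above, then registers it from its init function, e.g. roughly:
 *
 *	static struct ip_nat_protocol ip_nat_protocol_foo = {
 *		.name         = "FOO",
 *		.protonum     = IPPROTO_FOO,
 *		.manip_pkt    = foo_manip_pkt,
 *		.in_range     = foo_in_range,
 *		.unique_tuple = foo_unique_tuple,
 *	};
 *	ret = ip_nat_protocol_register(&ip_nat_protocol_foo);
 *
 * ip_nat_protocol_unregister() below restores ip_nat_unknown_protocol for
 * that protocol number on module unload.
 */
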
/* No one stores the protocol anywhere; simply delete it. */
void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
{
	write_lock_bh(&ip_nat_lock);
	ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol;
	write_unlock_bh(&ip_nat_lock);

	/* Someone could still be looking at the proto in a bh. */
	synchronize_net();
}

int __init ip_nat_init(void)
{
	size_t i;

	/* Leave them the same for the moment. */
	ip_nat_htable_size = ip_conntrack_htable_size;

	/* One vmalloc for the bysource hash table */
	bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size);
	if (!bysource)
		return -ENOMEM;

	/* Sew in builtin protocols. */
	write_lock_bh(&ip_nat_lock);
	for (i = 0; i < MAX_IP_NAT_PROTO; i++)
		ip_nat_protos[i] = &ip_nat_unknown_protocol;
	ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp;
	ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp;
	ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp;
	write_unlock_bh(&ip_nat_lock);

	for (i = 0; i < ip_nat_htable_size; i++) {
		INIT_LIST_HEAD(&bysource[i]);
	}

	/* FIXME: Man, this is a hack. <SIGH> */
	IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
	ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;

	/* Initialize fake conntrack so that NAT will skip it */
	ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK;
	return 0;
}

/* Clear NAT section of all conntracks, in case we're loaded again. */
static int clean_nat(struct ip_conntrack *i, void *data)
{
	memset(&i->nat, 0, sizeof(i->nat));
	i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
	return 0;
}

/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
void ip_nat_cleanup(void)
{
	ip_ct_iterate_cleanup(&clean_nat, NULL);
	ip_conntrack_destroyed = NULL;
	vfree(bysource);
}