1 /* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 *
12 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13 * - new API and handling of conntrack/nat helpers
14 * - now capable of multiple expectations for one master
15 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16 * - add usage/reference counts to ip_conntrack_expect
17 * - export ip_conntrack[_expect]_{find_get,put} functions
18 * */
19
20 #include <linux/config.h>
21 #include <linux/types.h>
22 #include <linux/icmp.h>
23 #include <linux/ip.h>
24 #include <linux/netfilter.h>
25 #include <linux/netfilter_ipv4.h>
26 #include <linux/module.h>
27 #include <linux/skbuff.h>
28 #include <linux/proc_fs.h>
29 #include <linux/vmalloc.h>
30 #include <net/checksum.h>
31 #include <net/ip.h>
32 #include <linux/stddef.h>
33 #include <linux/sysctl.h>
34 #include <linux/slab.h>
35 #include <linux/random.h>
36 #include <linux/jhash.h>
37 #include <linux/err.h>
38 #include <linux/percpu.h>
39 #include <linux/moduleparam.h>
40
41 /* ip_conntrack_lock protects the main hash table, protocol/helper/expected
42    registrations and conntrack timers. */
43 #define ASSERT_READ_LOCK(x)
44 #define ASSERT_WRITE_LOCK(x)
45
46 #include <linux/netfilter_ipv4/ip_conntrack.h>
47 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
50 #include <linux/netfilter_ipv4/listhelp.h>
51
52 #define IP_CONNTRACK_VERSION "2.1"
53
54 #if 0
55 #define DEBUGP printk
56 #else
57 #define DEBUGP(format, args...)
58 #endif
59
60 DEFINE_RWLOCK(ip_conntrack_lock);
61
62 /* ip_conntrack_standalone needs this */
63 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
64
65 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
66 LIST_HEAD(ip_conntrack_expect_list);
67 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
68 static LIST_HEAD(helpers);
69 unsigned int ip_conntrack_htable_size = 0;
70 int ip_conntrack_max;
71 struct list_head *ip_conntrack_hash;
72 static kmem_cache_t *ip_conntrack_cachep;
73 static kmem_cache_t *ip_conntrack_expect_cachep;
74 struct ip_conntrack ip_conntrack_untracked;
75 unsigned int ip_ct_log_invalid;
76 static LIST_HEAD(unconfirmed);
77 static int ip_conntrack_vmalloc;
78
79 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
80
81 void
82 ip_conntrack_put(struct ip_conntrack *ct)
83 {
84 IP_NF_ASSERT(ct);
85 nf_conntrack_put(&ct->ct_general);
86 }
87
88 static int ip_conntrack_hash_rnd_initted;
89 static unsigned int ip_conntrack_hash_rnd;
90
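/* Map a tuple to a bucket index in the conntrack hash table. */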
91 static u_int32_t
92 hash_conntrack(const struct ip_conntrack_tuple *tuple)
93 {
94 #if 0
95 dump_tuple(tuple);
96 #endif
97 return (jhash_3words(tuple->src.ip,
98 (tuple->dst.ip ^ tuple->dst.protonum),
99 (tuple->src.u.all | (tuple->dst.u.all << 16)),
100 ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
101 }
102
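/* Extract the original-direction tuple from the IP header; the layer 4
   part is filled in by the protocol's pkt_to_tuple().  Returns 0 on failure. */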
103 int
104 ip_ct_get_tuple(const struct iphdr *iph,
105 const struct sk_buff *skb,
106 unsigned int dataoff,
107 struct ip_conntrack_tuple *tuple,
108 const struct ip_conntrack_protocol *protocol)
109 {
110 /* Should never happen */
111 if (iph->frag_off & htons(IP_OFFSET)) {
112 printk("ip_conntrack_core: Frag of proto %u.\n",
113 iph->protocol);
114 return 0;
115 }
116
117 tuple->src.ip = iph->saddr;
118 tuple->dst.ip = iph->daddr;
119 tuple->dst.protonum = iph->protocol;
120 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
121
122 return protocol->pkt_to_tuple(skb, dataoff, tuple);
123 }
124
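/* Fill in 'inverse' with the tuple as seen from the reply direction of 'orig'. */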
125 int
126 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
127 const struct ip_conntrack_tuple *orig,
128 const struct ip_conntrack_protocol *protocol)
129 {
130 inverse->src.ip = orig->dst.ip;
131 inverse->dst.ip = orig->src.ip;
132 inverse->dst.protonum = orig->dst.protonum;
133 inverse->dst.dir = !orig->dst.dir;
134
135 return protocol->invert_tuple(inverse, orig);
136 }
137
138
139 /* ip_conntrack_expect helper functions */
140 static void unlink_expect(struct ip_conntrack_expect *exp)
141 {
142 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
143 IP_NF_ASSERT(!timer_pending(&exp->timeout));
144 list_del(&exp->list);
145 CONNTRACK_STAT_INC(expect_delete);
146 exp->master->expecting--;
147 }
148
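/* Timer callback: the expectation was never matched, so unlink and release it. */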
149 static void expectation_timed_out(unsigned long ul_expect)
150 {
151 struct ip_conntrack_expect *exp = (void *)ul_expect;
152
153 write_lock_bh(&ip_conntrack_lock);
154 unlink_expect(exp);
155 write_unlock_bh(&ip_conntrack_lock);
156 ip_conntrack_expect_put(exp);
157 }
158
159 /* If an expectation for this connection is found, it gets deleted from
160  * the global list, then returned. */
161 static struct ip_conntrack_expect *
162 find_expectation(const struct ip_conntrack_tuple *tuple)
163 {
164 struct ip_conntrack_expect *i;
165
166 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
167 /* If the master is not in the hash table yet (i.e. the packet hasn't left
168    this machine yet), how can the other end know about the expectation?
169    Hence these are not the droids you are looking for (if the
170    master ct never got confirmed, we'd hold a reference to it
171    and weird things would happen to future packets). */
172 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
173 && is_confirmed(i->master)
174 && del_timer(&i->timeout)) {
175 unlink_expect(i);
176 return i;
177 }
178 }
179 return NULL;
180 }
181
182 /* delete all expectations for this conntrack */
183 static void remove_expectations(struct ip_conntrack *ct)
184 {
185 struct ip_conntrack_expect *i, *tmp;
186
187 /* Optimization: most connections never expect any others. */
188 if (ct->expecting == 0)
189 return;
190
191 list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
192 if (i->master == ct && del_timer(&i->timeout)) {
193 unlink_expect(i);
194 ip_conntrack_expect_put(i);
195 }
196 }
197 }
198
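/* Unhash both directions of a conntrack and destroy its pending expectations. */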
199 static void
200 clean_from_lists(struct ip_conntrack *ct)
201 {
202 unsigned int ho, hr;
203
204 DEBUGP("clean_from_lists(%p)\n", ct);
205 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
206
207 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
208 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
209 LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
210 LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
211
212 /* Destroy all pending expectations */
213 remove_expectations(ct);
214 }
215
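/* Final destructor (ct_general.destroy): runs when the last reference
   to the conntrack is dropped. */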
216 static void
217 destroy_conntrack(struct nf_conntrack *nfct)
218 {
219 struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
220 struct ip_conntrack_protocol *proto;
221
222 DEBUGP("destroy_conntrack(%p)\n", ct);
223 IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
224 IP_NF_ASSERT(!timer_pending(&ct->timeout));
225
226 /* To make sure we don't get any weird locking issues here:
227 * destroy_conntrack() MUST NOT be called with a write lock
228 * to ip_conntrack_lock!!! -HW */
229 proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
230 if (proto && proto->destroy)
231 proto->destroy(ct);
232
233 if (ip_conntrack_destroyed)
234 ip_conntrack_destroyed(ct);
235
236 write_lock_bh(&ip_conntrack_lock);
237 /* Expectations will have been removed in clean_from_lists,
238 * except TFTP can create an expectation on the first packet,
239  * before the connection is in the list, so we need to clean here,
240 * too. */
241 remove_expectations(ct);
242
243 /* We overload first tuple to link into unconfirmed list. */
244 if (!is_confirmed(ct)) {
245 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
246 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
247 }
248
249 CONNTRACK_STAT_INC(delete);
250 write_unlock_bh(&ip_conntrack_lock);
251
252 if (ct->master)
253 ip_conntrack_put(ct->master);
254
255 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
256 kmem_cache_free(ip_conntrack_cachep, ct);
257 atomic_dec(&ip_conntrack_count);
258 }
259
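/* Timer callback: the connection timed out, so unhash it and drop the
   hash table's reference. */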
260 static void death_by_timeout(unsigned long ul_conntrack)
261 {
262 struct ip_conntrack *ct = (void *)ul_conntrack;
263
264 write_lock_bh(&ip_conntrack_lock);
265 /* Inside lock so preempt is disabled on module removal path.
266 * Otherwise we can get spurious warnings. */
267 CONNTRACK_STAT_INC(delete_list);
268 clean_from_lists(ct);
269 write_unlock_bh(&ip_conntrack_lock);
270 ip_conntrack_put(ct);
271 }
272
273 static inline int
274 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
275 const struct ip_conntrack_tuple *tuple,
276 const struct ip_conntrack *ignored_conntrack)
277 {
278 ASSERT_READ_LOCK(&ip_conntrack_lock);
279 return tuplehash_to_ctrack(i) != ignored_conntrack
280 && ip_ct_tuple_equal(tuple, &i->tuple);
281 }
282
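/* Look up a tuple in the hash table; the caller must hold ip_conntrack_lock. */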
283 static struct ip_conntrack_tuple_hash *
284 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
285 const struct ip_conntrack *ignored_conntrack)
286 {
287 struct ip_conntrack_tuple_hash *h;
288 unsigned int hash = hash_conntrack(tuple);
289
290 ASSERT_READ_LOCK(&ip_conntrack_lock);
291 list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
292 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
293 CONNTRACK_STAT_INC(found);
294 return h;
295 }
296 CONNTRACK_STAT_INC(searched);
297 }
298
299 return NULL;
300 }
301
302 /* Find a connection corresponding to a tuple. */
303 struct ip_conntrack_tuple_hash *
304 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
305 const struct ip_conntrack *ignored_conntrack)
306 {
307 struct ip_conntrack_tuple_hash *h;
308
309 read_lock_bh(&ip_conntrack_lock);
310 h = __ip_conntrack_find(tuple, ignored_conntrack);
311 if (h)
312 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
313 read_unlock_bh(&ip_conntrack_lock);
314
315 return h;
316 }
317
318 /* Confirm a connection given skb; places it in hash table */
319 int
320 __ip_conntrack_confirm(struct sk_buff **pskb)
321 {
322 unsigned int hash, repl_hash;
323 struct ip_conntrack *ct;
324 enum ip_conntrack_info ctinfo;
325
326 ct = ip_conntrack_get(*pskb, &ctinfo);
327
328 /* ipt_REJECT uses ip_conntrack_attach to attach related
329    ICMP/TCP RST packets in the other direction.  The actual packet
330    which created the connection will be IP_CT_NEW, or IP_CT_RELATED
331    for an expected connection. */
332 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
333 return NF_ACCEPT;
334
335 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
336 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
337
338 /* We're not in hash table, and we refuse to set up related
339 connections for unconfirmed conns. But packet copies and
340 REJECT will give spurious warnings here. */
341 /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
342
343 /* No external references means no one else could have
344    confirmed us. */
345 IP_NF_ASSERT(!is_confirmed(ct));
346 DEBUGP("Confirming conntrack %p\n", ct);
347
348 write_lock_bh(&ip_conntrack_lock);
349
350 /* See if there's one in the list already, including reverse:
351 NAT could have grabbed it without realizing, since we're
352    not in the hash.  If there is, we lost the race. */
353 if (!LIST_FIND(&ip_conntrack_hash[hash],
354 conntrack_tuple_cmp,
355 struct ip_conntrack_tuple_hash *,
356 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
357 && !LIST_FIND(&ip_conntrack_hash[repl_hash],
358 conntrack_tuple_cmp,
359 struct ip_conntrack_tuple_hash *,
360 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
361 /* Remove from unconfirmed list */
362 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
363
364 list_prepend(&ip_conntrack_hash[hash],
365 &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
366 list_prepend(&ip_conntrack_hash[repl_hash],
367 &ct->tuplehash[IP_CT_DIR_REPLY]);
368 /* Timer relative to confirmation time, not original
369 setting time, otherwise we'd get timer wrap in
370 weird delay cases. */
371 ct->timeout.expires += jiffies;
372 add_timer(&ct->timeout);
373 atomic_inc(&ct->ct_general.use);
374 set_bit(IPS_CONFIRMED_BIT, &ct->status);
375 CONNTRACK_STAT_INC(insert);
376 write_unlock_bh(&ip_conntrack_lock);
377 return NF_ACCEPT;
378 }
379
380 CONNTRACK_STAT_INC(insert_failed);
381 write_unlock_bh(&ip_conntrack_lock);
382
383 return NF_DROP;
384 }
385
386 /* Returns true if a connection corresponds to the tuple (required
387    for NAT). */
388 int
389 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
390 const struct ip_conntrack *ignored_conntrack)
391 {
392 struct ip_conntrack_tuple_hash *h;
393
394 read_lock_bh(&ip_conntrack_lock);
395 h = __ip_conntrack_find(tuple, ignored_conntrack);
396 read_unlock_bh(&ip_conntrack_lock);
397
398 return h != NULL;
399 }
400
401 /* There's a small race here where we may free a just-assured
402 connection. Too bad: we're in trouble anyway. */
403 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
404 {
405 return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
406 }
407
408 static int early_drop(struct list_head *chain)
409 {
410 /* Traverse backwards: gives us oldest, which is roughly LRU */
411 struct ip_conntrack_tuple_hash *h;
412 struct ip_conntrack *ct = NULL;
413 int dropped = 0;
414
415 read_lock_bh(&ip_conntrack_lock);
416 h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
417 if (h) {
418 ct = tuplehash_to_ctrack(h);
419 atomic_inc(&ct->ct_general.use);
420 }
421 read_unlock_bh(&ip_conntrack_lock);
422
423 if (!ct)
424 return dropped;
425
426 if (del_timer(&ct->timeout)) {
427 death_by_timeout((unsigned long)ct);
428 dropped = 1;
429 CONNTRACK_STAT_INC(early_drop);
430 }
431 ip_conntrack_put(ct);
432 return dropped;
433 }
434
435 static inline int helper_cmp(const struct ip_conntrack_helper *i,
436 const struct ip_conntrack_tuple *rtuple)
437 {
438 return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
439 }
440
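/* Find a registered helper whose tuple/mask matches the given (reply) tuple. */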
441 static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
442 {
443 return LIST_FIND(&helpers, helper_cmp,
444 struct ip_conntrack_helper *,
445 tuple);
446 }
447
448 /* Allocate a new conntrack: we return -ENOMEM if classification
449 failed due to stress. Otherwise it really is unclassifiable. */
450 static struct ip_conntrack_tuple_hash *
451 init_conntrack(const struct ip_conntrack_tuple *tuple,
452 struct ip_conntrack_protocol *protocol,
453 struct sk_buff *skb)
454 {
455 struct ip_conntrack *conntrack;
456 struct ip_conntrack_tuple repl_tuple;
457 size_t hash;
458 struct ip_conntrack_expect *exp;
459
460 if (!ip_conntrack_hash_rnd_initted) {
461 get_random_bytes(&ip_conntrack_hash_rnd, 4);
462 ip_conntrack_hash_rnd_initted = 1;
463 }
464
465 hash = hash_conntrack(tuple);
466
467 if (ip_conntrack_max
468 && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
469 /* Try dropping from this hash chain. */
470 if (!early_drop(&ip_conntrack_hash[hash])) {
471 if (net_ratelimit())
472 printk(KERN_WARNING
473 "ip_conntrack: table full, dropping"
474 " packet.\n");
475 return ERR_PTR(-ENOMEM);
476 }
477 }
478
479 if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
480 DEBUGP("Can't invert tuple.\n");
481 return NULL;
482 }
483
484 conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
485 if (!conntrack) {
486 DEBUGP("Can't allocate conntrack.\n");
487 return ERR_PTR(-ENOMEM);
488 }
489
490 memset(conntrack, 0, sizeof(*conntrack));
491 atomic_set(&conntrack->ct_general.use, 1);
492 conntrack->ct_general.destroy = destroy_conntrack;
493 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
494 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
495 if (!protocol->new(conntrack, skb)) {
496 kmem_cache_free(ip_conntrack_cachep, conntrack);
497 return NULL;
498 }
499 /* Don't set timer yet: wait for confirmation */
500 init_timer(&conntrack->timeout);
501 conntrack->timeout.data = (unsigned long)conntrack;
502 conntrack->timeout.function = death_by_timeout;
503
504 write_lock_bh(&ip_conntrack_lock);
505 exp = find_expectation(tuple);
506
507 if (exp) {
508 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
509 conntrack, exp);
510 /* Welcome, Mr. Bond. We've been expecting you... */
511 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
512 conntrack->master = exp->master;
513 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
514 conntrack->mark = exp->master->mark;
515 #endif
516 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
517 defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
518 /* this is ugly, but there is no other place to put it */
519 conntrack->nat.masq_index = exp->master->nat.masq_index;
520 #endif
521 nf_conntrack_get(&conntrack->master->ct_general);
522 CONNTRACK_STAT_INC(expect_new);
523 } else {
524 conntrack->helper = ip_ct_find_helper(&repl_tuple);
525
526 CONNTRACK_STAT_INC(new);
527 }
528
529 /* Overload tuple linked list to put us in unconfirmed list. */
530 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
531
532 atomic_inc(&ip_conntrack_count);
533 write_unlock_bh(&ip_conntrack_lock);
534
535 if (exp) {
536 if (exp->expectfn)
537 exp->expectfn(conntrack, exp);
538 ip_conntrack_expect_put(exp);
539 }
540
541 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
542 }
543
544 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
545 static inline struct ip_conntrack *
546 resolve_normal_ct(struct sk_buff *skb,
547 struct ip_conntrack_protocol *proto,
548 int *set_reply,
549 unsigned int hooknum,
550 enum ip_conntrack_info *ctinfo)
551 {
552 struct ip_conntrack_tuple tuple;
553 struct ip_conntrack_tuple_hash *h;
554 struct ip_conntrack *ct;
555
556 IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
557
558 if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
559 &tuple, proto))
560 return NULL;
561
562 /* look for tuple match */
563 h = ip_conntrack_find_get(&tuple, NULL);
564 if (!h) {
565 h = init_conntrack(&tuple, proto, skb);
566 if (!h)
567 return NULL;
568 if (IS_ERR(h))
569 return (void *)h;
570 }
571 ct = tuplehash_to_ctrack(h);
572
573 /* It exists; we have (non-exclusive) reference. */
574 if (DIRECTION(h) == IP_CT_DIR_REPLY) {
575 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
576 /* Please set reply bit if this packet OK */
577 *set_reply = 1;
578 } else {
579 /* Once we've had two way comms, always ESTABLISHED. */
580 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
581 DEBUGP("ip_conntrack_in: normal packet for %p\n",
582 ct);
583 *ctinfo = IP_CT_ESTABLISHED;
584 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
585 DEBUGP("ip_conntrack_in: related packet for %p\n",
586 ct);
587 *ctinfo = IP_CT_RELATED;
588 } else {
589 DEBUGP("ip_conntrack_in: new packet for %p\n",
590 ct);
591 *ctinfo = IP_CT_NEW;
592 }
593 *set_reply = 0;
594 }
595 skb->nfct = &ct->ct_general;
596 skb->nfctinfo = *ctinfo;
597 return ct;
598 }
599
600 /* Netfilter hook itself. */
601 unsigned int ip_conntrack_in(unsigned int hooknum,
602 struct sk_buff **pskb,
603 const struct net_device *in,
604 const struct net_device *out,
605 int (*okfn)(struct sk_buff *))
606 {
607 struct ip_conntrack *ct;
608 enum ip_conntrack_info ctinfo;
609 struct ip_conntrack_protocol *proto;
610 int set_reply;
611 int ret;
612
613 /* Previously seen (loopback or untracked)? Ignore. */
614 if ((*pskb)->nfct) {
615 CONNTRACK_STAT_INC(ignore);
616 return NF_ACCEPT;
617 }
618
619 /* Should never happen */
620 if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
621 if (net_ratelimit()) {
622 printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
623 (*pskb)->nh.iph->protocol, hooknum);
624 }
625 return NF_DROP;
626 }
627
628 /* Doesn't cover locally-generated broadcast, so not worth it. */
629 #if 0
630 /* Ignore broadcast: no `connection'. */
631 if ((*pskb)->pkt_type == PACKET_BROADCAST) {
632 printk("Broadcast packet!\n");
633 return NF_ACCEPT;
634 } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
635 == htonl(0x000000FF)) {
636 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
637 NIPQUAD((*pskb)->nh.iph->saddr),
638 NIPQUAD((*pskb)->nh.iph->daddr),
639 (*pskb)->sk, (*pskb)->pkt_type);
640 }
641 #endif
642
643 proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
644
645 /* It may be a special packet: error, unclean...
646  * The inverse of the return code tells the netfilter
647  * core what to do with the packet. */
648 if (proto->error != NULL
649 && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
650 CONNTRACK_STAT_INC(error);
651 CONNTRACK_STAT_INC(invalid);
652 return -ret;
653 }
654
655 if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo))) {
656 /* Not valid part of a connection */
657 CONNTRACK_STAT_INC(invalid);
658 return NF_ACCEPT;
659 }
660
661 if (IS_ERR(ct)) {
662 /* Too stressed to deal. */
663 CONNTRACK_STAT_INC(drop);
664 return NF_DROP;
665 }
666
667 IP_NF_ASSERT((*pskb)->nfct);
668
669 ret = proto->packet(ct, *pskb, ctinfo);
670 if (ret < 0) {
671 /* Invalid: the inverse of the return code tells
672  * the netfilter core what to do */
673 nf_conntrack_put((*pskb)->nfct);
674 (*pskb)->nfct = NULL;
675 CONNTRACK_STAT_INC(invalid);
676 return -ret;
677 }
678
679 if (set_reply)
680 set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
681
682 return ret;
683 }
684
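/* Invert a tuple, looking the protocol up from the tuple's protocol number. */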
685 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
686 const struct ip_conntrack_tuple *orig)
687 {
688 return ip_ct_invert_tuple(inverse, orig,
689 ip_ct_find_proto(orig->dst.protonum));
690 }
691
692 /* Would two expected things clash? */
693 static inline int expect_clash(const struct ip_conntrack_expect *a,
694 const struct ip_conntrack_expect *b)
695 {
696 /* Part covered by intersection of masks must be unequal,
697 otherwise they clash */
698 struct ip_conntrack_tuple intersect_mask
699 = { { a->mask.src.ip & b->mask.src.ip,
700 { a->mask.src.u.all & b->mask.src.u.all } },
701 { a->mask.dst.ip & b->mask.dst.ip,
702 { a->mask.dst.u.all & b->mask.dst.u.all },
703 a->mask.dst.protonum & b->mask.dst.protonum } };
704
705 return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
706 }
707
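/* Two expectations are identical if they have the same master, tuple and mask. */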
708 static inline int expect_matches(const struct ip_conntrack_expect *a,
709 const struct ip_conntrack_expect *b)
710 {
711 return a->master == b->master
712 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
713 && ip_ct_tuple_equal(&a->mask, &b->mask);
714 }
715
716 /* Generally a bad idea to call this: could have matched already. */
717 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
718 {
719 struct ip_conntrack_expect *i;
720
721 write_lock_bh(&ip_conntrack_lock);
722 /* choose the oldest expectation to evict */
723 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
724 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
725 unlink_expect(i);
726 write_unlock_bh(&ip_conntrack_lock);
727 ip_conntrack_expect_put(i);
728 return;
729 }
730 }
731 write_unlock_bh(&ip_conntrack_lock);
732 }
733
734 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
735 {
736 struct ip_conntrack_expect *new;
737
738 new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
739 if (!new) {
740 DEBUGP("expect_related: OOM allocating expect\n");
741 return NULL;
742 }
743 new->master = me;
744 atomic_inc(&new->master->ct_general.use);
745 atomic_set(&new->use, 1);
746 return new;
747 }
748
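/* Drop a reference to an expectation; on the final put the master conntrack
   is released and the expectation freed. */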
749 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
750 {
751 if (atomic_dec_and_test(&exp->use)) {
752 ip_conntrack_put(exp->master);
753 kmem_cache_free(ip_conntrack_expect_cachep, exp);
754 }
755 }
756
757 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
758 {
759 atomic_inc(&exp->use);
760 exp->master->expecting++;
761 list_add(&exp->list, &ip_conntrack_expect_list);
762
763 init_timer(&exp->timeout);
764 exp->timeout.data = (unsigned long)exp;
765 exp->timeout.function = expectation_timed_out;
766 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
767 add_timer(&exp->timeout);
768
769 CONNTRACK_STAT_INC(expect_create);
770 }
771
772 /* Race with expectations being used means we could have none to find; OK. */
773 static void evict_oldest_expect(struct ip_conntrack *master)
774 {
775 struct ip_conntrack_expect *i;
776
777 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
778 if (i->master == master) {
779 if (del_timer(&i->timeout)) {
780 unlink_expect(i);
781 ip_conntrack_expect_put(i);
782 }
783 break;
784 }
785 }
786 }
787
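/* Restart an expectation's timeout; returns 0 if the timer had already
   gone off (the expectation is dying). */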
788 static inline int refresh_timer(struct ip_conntrack_expect *i)
789 {
790 if (!del_timer(&i->timeout))
791 return 0;
792
793 i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
794 add_timer(&i->timeout);
795 return 1;
796 }
797
798 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
799 {
800 struct ip_conntrack_expect *i;
801 int ret;
802
803 DEBUGP("ip_conntrack_expect_related %p\n", expect);
804 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
805 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
806
807 write_lock_bh(&ip_conntrack_lock);
808 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
809 if (expect_matches(i, expect)) {
810 /* Refresh timer: if it's dying, ignore.. */
811 if (refresh_timer(i)) {
812 ret = 0;
813 goto out;
814 }
815 } else if (expect_clash(i, expect)) {
816 ret = -EBUSY;
817 goto out;
818 }
819 }
820
821 /* Will we be over the limit? */
822 if (expect->master->helper->max_expected &&
823 expect->master->expecting >= expect->master->helper->max_expected)
824 evict_oldest_expect(expect->master);
825
826 ip_conntrack_expect_insert(expect);
827 ret = 0;
828 out:
829 write_unlock_bh(&ip_conntrack_lock);
830 return ret;
831 }
832
833 /* Alter reply tuple (maybe alter helper). This is for NAT, and is
834 implicitly racy: see __ip_conntrack_confirm */
835 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
836 const struct ip_conntrack_tuple *newreply)
837 {
838 write_lock_bh(&ip_conntrack_lock);
839 /* Should be unconfirmed, so not in hash table yet */
840 IP_NF_ASSERT(!is_confirmed(conntrack));
841
842 DEBUGP("Altering reply tuple of %p to ", conntrack);
843 DUMP_TUPLE(newreply);
844
845 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
846 if (!conntrack->master && conntrack->expecting == 0)
847 conntrack->helper = ip_ct_find_helper(newreply);
848 write_unlock_bh(&ip_conntrack_lock);
849 }
850
851 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
852 {
853 BUG_ON(me->timeout == 0);
854 write_lock_bh(&ip_conntrack_lock);
855 list_prepend(&helpers, me);
856 write_unlock_bh(&ip_conntrack_lock);
857
858 return 0;
859 }
860
861 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
862 const struct ip_conntrack_helper *me)
863 {
864 if (tuplehash_to_ctrack(i)->helper == me)
865 tuplehash_to_ctrack(i)->helper = NULL;
866 return 0;
867 }
868
869 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
870 {
871 unsigned int i;
872 struct ip_conntrack_expect *exp, *tmp;
873
874 /* Need write lock here, to delete helper. */
875 write_lock_bh(&ip_conntrack_lock);
876 LIST_DELETE(&helpers, me);
877
878 /* Get rid of expectations */
879 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
880 if (exp->master->helper == me && del_timer(&exp->timeout)) {
881 unlink_expect(exp);
882 ip_conntrack_expect_put(exp);
883 }
884 }
885 /* Clear the helper pointer from unconfirmed and hashed conntracks. */
886 LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
887 for (i = 0; i < ip_conntrack_htable_size; i++)
888 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
889 struct ip_conntrack_tuple_hash *, me);
890 write_unlock_bh(&ip_conntrack_lock);
891
892 /* Someone could be still looking at the helper in a bh. */
893 synchronize_net();
894 }
895
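/* Update per-direction packet and byte counters when CONFIG_IP_NF_CT_ACCT
   is enabled. */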
896 static inline void ct_add_counters(struct ip_conntrack *ct,
897 enum ip_conntrack_info ctinfo,
898 const struct sk_buff *skb)
899 {
900 #ifdef CONFIG_IP_NF_CT_ACCT
901 if (skb) {
902 ct->counters[CTINFO2DIR(ctinfo)].packets++;
903 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
904 ntohs(skb->nh.iph->tot_len);
905 }
906 #endif
907 }
908
909 /* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
910 void ip_ct_refresh_acct(struct ip_conntrack *ct,
911 enum ip_conntrack_info ctinfo,
912 const struct sk_buff *skb,
913 unsigned long extra_jiffies)
914 {
915 IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
916
917 /* If not in hash table, timer will not be active yet */
918 if (!is_confirmed(ct)) {
919 ct->timeout.expires = extra_jiffies;
920 ct_add_counters(ct, ctinfo, skb);
921 } else {
922 write_lock_bh(&ip_conntrack_lock);
923 /* Need del_timer for race avoidance (may already be dying). */
924 if (del_timer(&ct->timeout)) {
925 ct->timeout.expires = jiffies + extra_jiffies;
926 add_timer(&ct->timeout);
927 }
928 ct_add_counters(ct, ctinfo, skb);
929 write_unlock_bh(&ip_conntrack_lock);
930 }
931 }
932
933 /* Returns new sk_buff, or NULL */
934 struct sk_buff *
935 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
936 {
937 skb_orphan(skb);
938
939 local_bh_disable();
940 skb = ip_defrag(skb, user);
941 local_bh_enable();
942
943 if (skb)
944 ip_send_check(skb->nh.iph);
945 return skb;
946 }
947
948 /* Used by ipt_REJECT. */
949 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
950 {
951 struct ip_conntrack *ct;
952 enum ip_conntrack_info ctinfo;
953
954 /* This ICMP is in reverse direction to the packet which caused it */
955 ct = ip_conntrack_get(skb, &ctinfo);
956
957 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
958 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
959 else
960 ctinfo = IP_CT_RELATED;
961
962 /* Attach to new skbuff, and increment count */
963 nskb->nfct = &ct->ct_general;
964 nskb->nfctinfo = ctinfo;
965 nf_conntrack_get(nskb->nfct);
966 }
967
968 static inline int
969 do_iter(const struct ip_conntrack_tuple_hash *i,
970 int (*iter)(struct ip_conntrack *i, void *data),
971 void *data)
972 {
973 return iter(tuplehash_to_ctrack(i), data);
974 }
975
976 /* Bring out ya dead! */
977 static struct ip_conntrack_tuple_hash *
978 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
979 void *data, unsigned int *bucket)
980 {
981 struct ip_conntrack_tuple_hash *h = NULL;
982
983 write_lock_bh(&ip_conntrack_lock);
984 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
985 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
986 struct ip_conntrack_tuple_hash *, iter, data);
987 if (h)
988 break;
989 }
990 if (!h)
991 h = LIST_FIND_W(&unconfirmed, do_iter,
992 struct ip_conntrack_tuple_hash *, iter, data);
993 if (h)
994 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
995 write_unlock_bh(&ip_conntrack_lock);
996
997 return h;
998 }
999
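/* Run iter() over every conntrack (hashed and unconfirmed) and kill each
   one for which it returns true. */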
1000 void
1001 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1002 {
1003 struct ip_conntrack_tuple_hash *h;
1004 unsigned int bucket = 0;
1005
1006 while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1007 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1008 /* Time to push up daisies... */
1009 if (del_timer(&ct->timeout))
1010 death_by_timeout((unsigned long)ct);
1011 /* ... else the timer will get him soon. */
1012
1013 ip_conntrack_put(ct);
1014 }
1015 }
1016
1017 /* Fast function for those who don't want to parse /proc (and I don't
1018 blame them). */
1019 /* Reversing the socket's dst/src point of view gives us the reply
1020 mapping. */
1021 static int
1022 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1023 {
1024 struct inet_sock *inet = inet_sk(sk);
1025 struct ip_conntrack_tuple_hash *h;
1026 struct ip_conntrack_tuple tuple;
1027
1028 IP_CT_TUPLE_U_BLANK(&tuple);
1029 tuple.src.ip = inet->rcv_saddr;
1030 tuple.src.u.tcp.port = inet->sport;
1031 tuple.dst.ip = inet->daddr;
1032 tuple.dst.u.tcp.port = inet->dport;
1033 tuple.dst.protonum = IPPROTO_TCP;
1034
1035 /* We only do TCP at the moment: is there a better way? */
1036 if (strcmp(sk->sk_prot->name, "TCP")) {
1037 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1038 return -ENOPROTOOPT;
1039 }
1040
1041 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1042 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1043 *len, sizeof(struct sockaddr_in));
1044 return -EINVAL;
1045 }
1046
1047 h = ip_conntrack_find_get(&tuple, NULL);
1048 if (h) {
1049 struct sockaddr_in sin;
1050 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1051
1052 sin.sin_family = AF_INET;
1053 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1054 .tuple.dst.u.tcp.port;
1055 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1056 .tuple.dst.ip;
1057
1058 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1059 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1060 ip_conntrack_put(ct);
1061 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1062 return -EFAULT;
1063 else
1064 return 0;
1065 }
1066 DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1067 NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1068 NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1069 return -ENOENT;
1070 }
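/* A minimal userspace sketch (illustrative only, not part of this file):
 * a transparent proxy recovers the pre-NAT destination of an accepted TCP
 * connection with getsockopt(SO_ORIGINAL_DST), which is answered by
 * getorigdst() above.  'fd' is a hypothetical connected socket:
 *
 *	struct sockaddr_in dst;
 *	socklen_t len = sizeof(dst);
 *
 *	if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &dst, &len) == 0)
 *		printf("original dst %s:%u\n", inet_ntoa(dst.sin_addr),
 *		       ntohs(dst.sin_port));
 */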
1071
1072 static struct nf_sockopt_ops so_getorigdst = {
1073 .pf = PF_INET,
1074 .get_optmin = SO_ORIGINAL_DST,
1075 .get_optmax = SO_ORIGINAL_DST+1,
1076 .get = &getorigdst,
1077 };
1078
1079 static int kill_all(struct ip_conntrack *i, void *data)
1080 {
1081 return 1;
1082 }
1083
1084 static void free_conntrack_hash(void)
1085 {
1086 if (ip_conntrack_vmalloc)
1087 vfree(ip_conntrack_hash);
1088 else
1089 free_pages((unsigned long)ip_conntrack_hash,
1090 get_order(sizeof(struct list_head)
1091 * ip_conntrack_htable_size));
1092 }
1093
1094 /* Mishearing the voices in his head, our hero wonders how he's
1095 supposed to kill the mall. */
1096 void ip_conntrack_cleanup(void)
1097 {
1098 ip_ct_attach = NULL;
1099 /* This makes sure all current packets have passed through
1100 the netfilter framework.  Roll on, two-stage module
1101 delete... */
1102 synchronize_net();
1103
1104 i_see_dead_people:
1105 ip_ct_iterate_cleanup(kill_all, NULL);
1106 if (atomic_read(&ip_conntrack_count) != 0) {
1107 schedule();
1108 goto i_see_dead_people;
1109 }
1110 /* wait until all references to ip_conntrack_untracked are dropped */
1111 while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1112 schedule();
1113
1114 kmem_cache_destroy(ip_conntrack_cachep);
1115 kmem_cache_destroy(ip_conntrack_expect_cachep);
1116 free_conntrack_hash();
1117 nf_unregister_sockopt(&so_getorigdst);
1118 }
1119
1120 static int hashsize;
1121 module_param(hashsize, int, 0400);
1122
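/* Module init: size and allocate the hash table, create the slab caches,
   register the built-in protocols and the SO_ORIGINAL_DST sockopt. */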
1123 int __init ip_conntrack_init(void)
1124 {
1125 unsigned int i;
1126 int ret;
1127
1128 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1129 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
1130 if (hashsize) {
1131 ip_conntrack_htable_size = hashsize;
1132 } else {
1133 ip_conntrack_htable_size
1134 = (((num_physpages << PAGE_SHIFT) / 16384)
1135 / sizeof(struct list_head));
1136 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1137 ip_conntrack_htable_size = 8192;
1138 if (ip_conntrack_htable_size < 16)
1139 ip_conntrack_htable_size = 16;
1140 }
1141 ip_conntrack_max = 8 * ip_conntrack_htable_size;
1142
1143 printk("ip_conntrack version %s (%u buckets, %d max)"
1144 " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1145 ip_conntrack_htable_size, ip_conntrack_max,
1146 sizeof(struct ip_conntrack));
1147
1148 ret = nf_register_sockopt(&so_getorigdst);
1149 if (ret != 0) {
1150 printk(KERN_ERR "Unable to register netfilter socket option\n");
1151 return ret;
1152 }
1153
1154 /* AK: the hash table is twice as big as needed because it
1155    uses list_head.  It would be much nicer for caches to use a
1156    single-pointer list head here. */
1157 ip_conntrack_vmalloc = 0;
1158 ip_conntrack_hash
1159 =(void*)__get_free_pages(GFP_KERNEL,
1160 get_order(sizeof(struct list_head)
1161 *ip_conntrack_htable_size));
1162 if (!ip_conntrack_hash) {
1163 ip_conntrack_vmalloc = 1;
1164 printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1165 ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1166 * ip_conntrack_htable_size);
1167 }
1168 if (!ip_conntrack_hash) {
1169 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1170 goto err_unreg_sockopt;
1171 }
1172
1173 ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1174 sizeof(struct ip_conntrack), 0,
1175 0, NULL, NULL);
1176 if (!ip_conntrack_cachep) {
1177 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1178 goto err_free_hash;
1179 }
1180
1181 ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1182 sizeof(struct ip_conntrack_expect),
1183 0, 0, NULL, NULL);
1184 if (!ip_conntrack_expect_cachep) {
1185 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1186 goto err_free_conntrack_slab;
1187 }
1188
1189 /* Don't NEED lock here, but good form anyway. */
1190 write_lock_bh(&ip_conntrack_lock);
1191 for (i = 0; i < MAX_IP_CT_PROTO; i++)
1192 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1193 /* Sew in builtin protocols. */
1194 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1195 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1196 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1197 write_unlock_bh(&ip_conntrack_lock);
1198
1199 for (i = 0; i < ip_conntrack_htable_size; i++)
1200 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1201
1202 /* For use by ipt_REJECT */
1203 ip_ct_attach = ip_conntrack_attach;
1204
1205 /* Set up fake conntrack:
1206 - to never be deleted, not in any hashes */
1207 atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1208 /* - and make it look like a confirmed connection */
1209 set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1210
1211 return ret;
1212
1213 err_free_conntrack_slab:
1214 kmem_cache_destroy(ip_conntrack_cachep);
1215 err_free_hash:
1216 free_conntrack_hash();
1217 err_unreg_sockopt:
1218 nf_unregister_sockopt(&so_getorigdst);
1219
1220 return -ENOMEM;
1221 }