1 /* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 *
12 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13 * - new API and handling of conntrack/nat helpers
14 * - now capable of multiple expectations for one master
15 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16 * - add usage/reference counts to ip_conntrack_expect
17 * - export ip_conntrack[_expect]_{find_get,put} functions
18 * */
19
20 #include <linux/types.h>
21 #include <linux/icmp.h>
22 #include <linux/ip.h>
23 #include <linux/netfilter.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/module.h>
26 #include <linux/skbuff.h>
27 #include <linux/proc_fs.h>
28 #include <linux/vmalloc.h>
29 #include <net/checksum.h>
30 #include <net/ip.h>
31 #include <linux/stddef.h>
32 #include <linux/sysctl.h>
33 #include <linux/slab.h>
34 #include <linux/random.h>
35 #include <linux/jhash.h>
36 #include <linux/err.h>
37 #include <linux/percpu.h>
38 #include <linux/moduleparam.h>
39 #include <linux/notifier.h>
40
41 /* ip_conntrack_lock protects the main hash table, protocol/helper/expected
42 registrations, conntrack timers*/
43 #include <linux/netfilter_ipv4/ip_conntrack.h>
44 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
45 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
46 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
47
48 #define IP_CONNTRACK_VERSION "2.4"
49
50 #if 0
51 #define DEBUGP printk
52 #else
53 #define DEBUGP(format, args...)
54 #endif
55
56 DEFINE_RWLOCK(ip_conntrack_lock);
57
58 /* ip_conntrack_standalone needs this */
59 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
60
61 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
62 LIST_HEAD(ip_conntrack_expect_list);
63 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO] __read_mostly;
64 static LIST_HEAD(helpers);
65 unsigned int ip_conntrack_htable_size __read_mostly = 0;
66 int ip_conntrack_max __read_mostly;
67 struct list_head *ip_conntrack_hash __read_mostly;
68 static struct kmem_cache *ip_conntrack_cachep __read_mostly;
69 static struct kmem_cache *ip_conntrack_expect_cachep __read_mostly;
70 struct ip_conntrack ip_conntrack_untracked;
71 unsigned int ip_ct_log_invalid __read_mostly;
72 static LIST_HEAD(unconfirmed);
73 static int ip_conntrack_vmalloc __read_mostly;
74
75 static unsigned int ip_conntrack_next_id;
76 static unsigned int ip_conntrack_expect_next_id;
77 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
78 ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
79 ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);
80
81 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
82
83 /* deliver cached events and clear cache entry - must be called with locally
84 * disabled softirqs */
85 static inline void
86 __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
87 {
88 DEBUGP("ecache: delivering events for %p\n", ecache->ct);
89 if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
90 atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
91 ecache->ct);
92 ecache->events = 0;
93 ip_conntrack_put(ecache->ct);
94 ecache->ct = NULL;
95 }
96
97 /* Deliver all cached events for a particular conntrack. This is called
98 * by code prior to async packet handling or freeing the skb */
99 void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
100 {
101 struct ip_conntrack_ecache *ecache;
102
103 local_bh_disable();
104 ecache = &__get_cpu_var(ip_conntrack_ecache);
105 if (ecache->ct == ct)
106 __ip_ct_deliver_cached_events(ecache);
107 local_bh_enable();
108 }
109
110 void __ip_ct_event_cache_init(struct ip_conntrack *ct)
111 {
112 struct ip_conntrack_ecache *ecache;
113
114 /* take care of delivering potentially old events */
115 ecache = &__get_cpu_var(ip_conntrack_ecache);
116 BUG_ON(ecache->ct == ct);
117 if (ecache->ct)
118 __ip_ct_deliver_cached_events(ecache);
119 /* initialize for this conntrack/packet */
120 ecache->ct = ct;
121 nf_conntrack_get(&ct->ct_general);
122 }
123
124 /* flush the event cache - touches other CPUs' data and must not be called while
125 * packets are still passing through the code */
126 static void ip_ct_event_cache_flush(void)
127 {
128 struct ip_conntrack_ecache *ecache;
129 int cpu;
130
131 for_each_possible_cpu(cpu) {
132 ecache = &per_cpu(ip_conntrack_ecache, cpu);
133 if (ecache->ct)
134 ip_conntrack_put(ecache->ct);
135 }
136 }
137 #else
138 static inline void ip_ct_event_cache_flush(void) {}
139 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
140
141 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
142
143 static int ip_conntrack_hash_rnd_initted;
144 static unsigned int ip_conntrack_hash_rnd;
145
146 static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
147 unsigned int size, unsigned int rnd)
148 {
149 return (jhash_3words((__force u32)tuple->src.ip,
150 ((__force u32)tuple->dst.ip ^ tuple->dst.protonum),
151 (tuple->src.u.all | (tuple->dst.u.all << 16)),
152 rnd) % size);
153 }
154
155 static u_int32_t
156 hash_conntrack(const struct ip_conntrack_tuple *tuple)
157 {
158 return __hash_conntrack(tuple, ip_conntrack_htable_size,
159 ip_conntrack_hash_rnd);
160 }
161
162 int
163 ip_ct_get_tuple(const struct iphdr *iph,
164 const struct sk_buff *skb,
165 unsigned int dataoff,
166 struct ip_conntrack_tuple *tuple,
167 const struct ip_conntrack_protocol *protocol)
168 {
169 /* Should never happen */
170 if (iph->frag_off & htons(IP_OFFSET)) {
171 printk("ip_conntrack_core: Frag of proto %u.\n",
172 iph->protocol);
173 return 0;
174 }
175
176 tuple->src.ip = iph->saddr;
177 tuple->dst.ip = iph->daddr;
178 tuple->dst.protonum = iph->protocol;
179 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
180
181 return protocol->pkt_to_tuple(skb, dataoff, tuple);
182 }
183
184 int
185 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
186 const struct ip_conntrack_tuple *orig,
187 const struct ip_conntrack_protocol *protocol)
188 {
189 inverse->src.ip = orig->dst.ip;
190 inverse->dst.ip = orig->src.ip;
191 inverse->dst.protonum = orig->dst.protonum;
192 inverse->dst.dir = !orig->dst.dir;
193
194 return protocol->invert_tuple(inverse, orig);
195 }
196
197
198 /* ip_conntrack_expect helper functions */
199 void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
200 {
201 IP_NF_ASSERT(!timer_pending(&exp->timeout));
202 list_del(&exp->list);
203 CONNTRACK_STAT_INC(expect_delete);
204 exp->master->expecting--;
205 ip_conntrack_expect_put(exp);
206 }
207
208 static void expectation_timed_out(unsigned long ul_expect)
209 {
210 struct ip_conntrack_expect *exp = (void *)ul_expect;
211
212 write_lock_bh(&ip_conntrack_lock);
213 ip_ct_unlink_expect(exp);
214 write_unlock_bh(&ip_conntrack_lock);
215 ip_conntrack_expect_put(exp);
216 }
217
218 struct ip_conntrack_expect *
219 __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
220 {
221 struct ip_conntrack_expect *i;
222
223 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
224 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask))
225 return i;
226 }
227 return NULL;
228 }
229
230 /* Just find an expectation corresponding to a tuple. */
231 struct ip_conntrack_expect *
232 ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
233 {
234 struct ip_conntrack_expect *i;
235
236 read_lock_bh(&ip_conntrack_lock);
237 i = __ip_conntrack_expect_find(tuple);
238 if (i)
239 atomic_inc(&i->use);
240 read_unlock_bh(&ip_conntrack_lock);
241
242 return i;
243 }
244
245 /* If an expectation for this connection is found, it gets deleted from
246  * the global list, then returned. */
247 static struct ip_conntrack_expect *
248 find_expectation(const struct ip_conntrack_tuple *tuple)
249 {
250 struct ip_conntrack_expect *i;
251
252 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
253 /* If master is not in hash table yet (ie. packet hasn't left
254 this machine yet), how can other end know about expected?
255 Hence these are not the droids you are looking for (if
256 master ct never got confirmed, we'd hold a reference to it
257 and weird things would happen to future packets). */
258 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
259 && is_confirmed(i->master)) {
260 if (i->flags & IP_CT_EXPECT_PERMANENT) {
261 atomic_inc(&i->use);
262 return i;
263 } else if (del_timer(&i->timeout)) {
264 ip_ct_unlink_expect(i);
265 return i;
266 }
267 }
268 }
269 return NULL;
270 }
271
272 /* delete all expectations for this conntrack */
273 void ip_ct_remove_expectations(struct ip_conntrack *ct)
274 {
275 struct ip_conntrack_expect *i, *tmp;
276
277 /* Optimization: most connections never expect any others. */
278 if (ct->expecting == 0)
279 return;
280
281 list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
282 if (i->master == ct && del_timer(&i->timeout)) {
283 ip_ct_unlink_expect(i);
284 ip_conntrack_expect_put(i);
285 }
286 }
287 }
288
289 static void
290 clean_from_lists(struct ip_conntrack *ct)
291 {
292 DEBUGP("clean_from_lists(%p)\n", ct);
293 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
294 list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list);
295
296 /* Destroy all pending expectations */
297 ip_ct_remove_expectations(ct);
298 }
299
300 static void
301 destroy_conntrack(struct nf_conntrack *nfct)
302 {
303 struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
304 struct ip_conntrack_protocol *proto;
305 struct ip_conntrack_helper *helper;
306
307 DEBUGP("destroy_conntrack(%p)\n", ct);
308 IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
309 IP_NF_ASSERT(!timer_pending(&ct->timeout));
310
311 ip_conntrack_event(IPCT_DESTROY, ct);
312 set_bit(IPS_DYING_BIT, &ct->status);
313
314 helper = ct->helper;
315 if (helper && helper->destroy)
316 helper->destroy(ct);
317
318 /* To make sure we don't get any weird locking issues here:
319 * destroy_conntrack() MUST NOT be called with a write lock
320 * to ip_conntrack_lock!!! -HW */
321 proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
322 if (proto && proto->destroy)
323 proto->destroy(ct);
324
325 if (ip_conntrack_destroyed)
326 ip_conntrack_destroyed(ct);
327
328 write_lock_bh(&ip_conntrack_lock);
329 /* Expectations will have been removed in clean_from_lists,
330 * except TFTP can create an expectation on the first packet,
331 * before connection is in the list, so we need to clean here,
332 * too. */
333 ip_ct_remove_expectations(ct);
334
335 /* We overload first tuple to link into unconfirmed list. */
336 if (!is_confirmed(ct)) {
337 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
338 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
339 }
340
341 CONNTRACK_STAT_INC(delete);
342 write_unlock_bh(&ip_conntrack_lock);
343
344 if (ct->master)
345 ip_conntrack_put(ct->master);
346
347 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
348 ip_conntrack_free(ct);
349 }
350
351 static void death_by_timeout(unsigned long ul_conntrack)
352 {
353 struct ip_conntrack *ct = (void *)ul_conntrack;
354
355 write_lock_bh(&ip_conntrack_lock);
356 /* Inside lock so preempt is disabled on module removal path.
357 * Otherwise we can get spurious warnings. */
358 CONNTRACK_STAT_INC(delete_list);
359 clean_from_lists(ct);
360 write_unlock_bh(&ip_conntrack_lock);
361 ip_conntrack_put(ct);
362 }
363
364 struct ip_conntrack_tuple_hash *
365 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
366 const struct ip_conntrack *ignored_conntrack)
367 {
368 struct ip_conntrack_tuple_hash *h;
369 unsigned int hash = hash_conntrack(tuple);
370
371 list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
372 if (tuplehash_to_ctrack(h) != ignored_conntrack &&
373 ip_ct_tuple_equal(tuple, &h->tuple)) {
374 CONNTRACK_STAT_INC(found);
375 return h;
376 }
377 CONNTRACK_STAT_INC(searched);
378 }
379
380 return NULL;
381 }
382
383 /* Find a connection corresponding to a tuple. */
384 struct ip_conntrack_tuple_hash *
385 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
386 const struct ip_conntrack *ignored_conntrack)
387 {
388 struct ip_conntrack_tuple_hash *h;
389
390 read_lock_bh(&ip_conntrack_lock);
391 h = __ip_conntrack_find(tuple, ignored_conntrack);
392 if (h)
393 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
394 read_unlock_bh(&ip_conntrack_lock);
395
396 return h;
397 }
398
399 static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
400 unsigned int hash,
401 unsigned int repl_hash)
402 {
403 ct->id = ++ip_conntrack_next_id;
404 list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list,
405 &ip_conntrack_hash[hash]);
406 list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list,
407 &ip_conntrack_hash[repl_hash]);
408 }
409
410 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
411 {
412 unsigned int hash, repl_hash;
413
414 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
415 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
416
417 write_lock_bh(&ip_conntrack_lock);
418 __ip_conntrack_hash_insert(ct, hash, repl_hash);
419 write_unlock_bh(&ip_conntrack_lock);
420 }
421
422 /* Confirm a connection given skb; places it in hash table */
423 int
424 __ip_conntrack_confirm(struct sk_buff **pskb)
425 {
426 unsigned int hash, repl_hash;
427 struct ip_conntrack_tuple_hash *h;
428 struct ip_conntrack *ct;
429 enum ip_conntrack_info ctinfo;
430
431 ct = ip_conntrack_get(*pskb, &ctinfo);
432
433 /* ipt_REJECT uses ip_conntrack_attach to attach related
434 ICMP/TCP RST packets in the other direction. The actual packet
435 which created the connection will be IP_CT_NEW or, for an
436 expected connection, IP_CT_RELATED. */
437 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
438 return NF_ACCEPT;
439
440 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
441 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
442
443 /* We're not in hash table, and we refuse to set up related
444 connections for unconfirmed conns. But packet copies and
445 REJECT will give spurious warnings here. */
446 /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
447
448 /* No external references means no one else could have
449 confirmed us. */
450 IP_NF_ASSERT(!is_confirmed(ct));
451 DEBUGP("Confirming conntrack %p\n", ct);
452
453 write_lock_bh(&ip_conntrack_lock);
454
455 /* See if there's one in the list already, including reverse:
456 NAT could have grabbed it without realizing, since we're
457 not in the hash. If there is, we lost the race. */
458 list_for_each_entry(h, &ip_conntrack_hash[hash], list)
459 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
460 &h->tuple))
461 goto out;
462 list_for_each_entry(h, &ip_conntrack_hash[repl_hash], list)
463 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
464 &h->tuple))
465 goto out;
466
467 /* Remove from unconfirmed list */
468 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
469
470 __ip_conntrack_hash_insert(ct, hash, repl_hash);
471 /* Timer relative to confirmation time, not original
472 setting time, otherwise we'd get timer wrap in
473 weird delay cases. */
474 ct->timeout.expires += jiffies;
475 add_timer(&ct->timeout);
476 atomic_inc(&ct->ct_general.use);
477 set_bit(IPS_CONFIRMED_BIT, &ct->status);
478 CONNTRACK_STAT_INC(insert);
479 write_unlock_bh(&ip_conntrack_lock);
480 if (ct->helper)
481 ip_conntrack_event_cache(IPCT_HELPER, *pskb);
482 #ifdef CONFIG_IP_NF_NAT_NEEDED
483 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
484 test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
485 ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
486 #endif
487 ip_conntrack_event_cache(master_ct(ct) ?
488 IPCT_RELATED : IPCT_NEW, *pskb);
489
490 return NF_ACCEPT;
491
492 out:
493 CONNTRACK_STAT_INC(insert_failed);
494 write_unlock_bh(&ip_conntrack_lock);
495 return NF_DROP;
496 }
497
498 /* Returns true if a connection corresponds to the tuple (required
499 for NAT). */
500 int
501 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
502 const struct ip_conntrack *ignored_conntrack)
503 {
504 struct ip_conntrack_tuple_hash *h;
505
506 read_lock_bh(&ip_conntrack_lock);
507 h = __ip_conntrack_find(tuple, ignored_conntrack);
508 read_unlock_bh(&ip_conntrack_lock);
509
510 return h != NULL;
511 }
512
513 /* There's a small race here where we may free a just-assured
514 connection. Too bad: we're in trouble anyway. */
515 static int early_drop(struct list_head *chain)
516 {
517 /* Traverse backwards: gives us oldest, which is roughly LRU */
518 struct ip_conntrack_tuple_hash *h;
519 struct ip_conntrack *ct = NULL, *tmp;
520 int dropped = 0;
521
522 read_lock_bh(&ip_conntrack_lock);
523 list_for_each_entry_reverse(h, chain, list) {
524 tmp = tuplehash_to_ctrack(h);
525 if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) {
526 ct = tmp;
527 atomic_inc(&ct->ct_general.use);
528 break;
529 }
530 }
531 read_unlock_bh(&ip_conntrack_lock);
532
533 if (!ct)
534 return dropped;
535
536 if (del_timer(&ct->timeout)) {
537 death_by_timeout((unsigned long)ct);
538 dropped = 1;
539 CONNTRACK_STAT_INC(early_drop);
540 }
541 ip_conntrack_put(ct);
542 return dropped;
543 }
544
545 static struct ip_conntrack_helper *
546 __ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
547 {
548 struct ip_conntrack_helper *h;
549
550 list_for_each_entry(h, &helpers, list) {
551 if (ip_ct_tuple_mask_cmp(tuple, &h->tuple, &h->mask))
552 return h;
553 }
554 return NULL;
555 }
556
557 struct ip_conntrack_helper *
558 ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
559 {
560 struct ip_conntrack_helper *helper;
561
562 /* need ip_conntrack_lock to assure that helper exists until
563 * try_module_get() is called */
564 read_lock_bh(&ip_conntrack_lock);
565
566 helper = __ip_conntrack_helper_find(tuple);
567 if (helper) {
568 /* need to increase module usage count to assure helper will
569 * not go away while the caller is e.g. busy putting a
570 * conntrack in the hash that uses the helper */
571 if (!try_module_get(helper->me))
572 helper = NULL;
573 }
574
575 read_unlock_bh(&ip_conntrack_lock);
576
577 return helper;
578 }
579
580 void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
581 {
582 module_put(helper->me);
583 }
584
585 struct ip_conntrack_protocol *
586 __ip_conntrack_proto_find(u_int8_t protocol)
587 {
588 return ip_ct_protos[protocol];
589 }
590
591 /* this is guaranteed to always return a valid protocol helper, since
592 * it falls back to generic_protocol */
593 struct ip_conntrack_protocol *
594 ip_conntrack_proto_find_get(u_int8_t protocol)
595 {
596 struct ip_conntrack_protocol *p;
597
598 preempt_disable();
599 p = __ip_conntrack_proto_find(protocol);
600 if (p) {
601 if (!try_module_get(p->me))
602 p = &ip_conntrack_generic_protocol;
603 }
604 preempt_enable();
605
606 return p;
607 }
608
609 void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
610 {
611 module_put(p->me);
612 }
613
614 struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
615 struct ip_conntrack_tuple *repl)
616 {
617 struct ip_conntrack *conntrack;
618
619 if (!ip_conntrack_hash_rnd_initted) {
620 get_random_bytes(&ip_conntrack_hash_rnd, 4);
621 ip_conntrack_hash_rnd_initted = 1;
622 }
623
624 /* We don't want any race condition at early drop stage */
625 atomic_inc(&ip_conntrack_count);
626
627 if (ip_conntrack_max
628 && atomic_read(&ip_conntrack_count) > ip_conntrack_max) {
629 unsigned int hash = hash_conntrack(orig);
630 /* Try dropping from this hash chain. */
631 if (!early_drop(&ip_conntrack_hash[hash])) {
632 atomic_dec(&ip_conntrack_count);
633 if (net_ratelimit())
634 printk(KERN_WARNING
635 "ip_conntrack: table full, dropping"
636 " packet.\n");
637 return ERR_PTR(-ENOMEM);
638 }
639 }
640
641 conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
642 if (!conntrack) {
643 DEBUGP("Can't allocate conntrack.\n");
644 atomic_dec(&ip_conntrack_count);
645 return ERR_PTR(-ENOMEM);
646 }
647
648 memset(conntrack, 0, sizeof(*conntrack));
649 atomic_set(&conntrack->ct_general.use, 1);
650 conntrack->ct_general.destroy = destroy_conntrack;
651 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
652 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
653 /* Don't set timer yet: wait for confirmation */
654 init_timer(&conntrack->timeout);
655 conntrack->timeout.data = (unsigned long)conntrack;
656 conntrack->timeout.function = death_by_timeout;
657
658 return conntrack;
659 }
660
661 void
662 ip_conntrack_free(struct ip_conntrack *conntrack)
663 {
664 atomic_dec(&ip_conntrack_count);
665 kmem_cache_free(ip_conntrack_cachep, conntrack);
666 }
667
668 /* Allocate a new conntrack: we return -ENOMEM if classification
669 * failed due to stress. Otherwise it really is unclassifiable */
670 static struct ip_conntrack_tuple_hash *
671 init_conntrack(struct ip_conntrack_tuple *tuple,
672 struct ip_conntrack_protocol *protocol,
673 struct sk_buff *skb)
674 {
675 struct ip_conntrack *conntrack;
676 struct ip_conntrack_tuple repl_tuple;
677 struct ip_conntrack_expect *exp;
678
679 if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
680 DEBUGP("Can't invert tuple.\n");
681 return NULL;
682 }
683
684 conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
685 if (conntrack == NULL || IS_ERR(conntrack))
686 return (struct ip_conntrack_tuple_hash *)conntrack;
687
688 if (!protocol->new(conntrack, skb)) {
689 ip_conntrack_free(conntrack);
690 return NULL;
691 }
692
693 write_lock_bh(&ip_conntrack_lock);
694 exp = find_expectation(tuple);
695
696 if (exp) {
697 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
698 conntrack, exp);
699 /* Welcome, Mr. Bond. We've been expecting you... */
700 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
701 conntrack->master = exp->master;
702 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
703 conntrack->mark = exp->master->mark;
704 #endif
705 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
706 defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
707 /* this is ugly, but there is no other place where to put it */
708 conntrack->nat.masq_index = exp->master->nat.masq_index;
709 #endif
710 #ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
711 conntrack->secmark = exp->master->secmark;
712 #endif
713 nf_conntrack_get(&conntrack->master->ct_general);
714 CONNTRACK_STAT_INC(expect_new);
715 } else {
716 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
717
718 CONNTRACK_STAT_INC(new);
719 }
720
721 /* Overload tuple linked list to put us in unconfirmed list. */
722 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
723
724 write_unlock_bh(&ip_conntrack_lock);
725
726 if (exp) {
727 if (exp->expectfn)
728 exp->expectfn(conntrack, exp);
729 ip_conntrack_expect_put(exp);
730 }
731
732 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
733 }
734
735 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
736 static inline struct ip_conntrack *
737 resolve_normal_ct(struct sk_buff *skb,
738 struct ip_conntrack_protocol *proto,
739 int *set_reply,
740 unsigned int hooknum,
741 enum ip_conntrack_info *ctinfo)
742 {
743 struct ip_conntrack_tuple tuple;
744 struct ip_conntrack_tuple_hash *h;
745 struct ip_conntrack *ct;
746
747 IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
748
749 if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
750 &tuple,proto))
751 return NULL;
752
753 /* look for tuple match */
754 h = ip_conntrack_find_get(&tuple, NULL);
755 if (!h) {
756 h = init_conntrack(&tuple, proto, skb);
757 if (!h)
758 return NULL;
759 if (IS_ERR(h))
760 return (void *)h;
761 }
762 ct = tuplehash_to_ctrack(h);
763
764 /* It exists; we have (non-exclusive) reference. */
765 if (DIRECTION(h) == IP_CT_DIR_REPLY) {
766 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
767 /* Please set reply bit if this packet OK */
768 *set_reply = 1;
769 } else {
770 /* Once we've had two way comms, always ESTABLISHED. */
771 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
772 DEBUGP("ip_conntrack_in: normal packet for %p\n",
773 ct);
774 *ctinfo = IP_CT_ESTABLISHED;
775 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
776 DEBUGP("ip_conntrack_in: related packet for %p\n",
777 ct);
778 *ctinfo = IP_CT_RELATED;
779 } else {
780 DEBUGP("ip_conntrack_in: new packet for %p\n",
781 ct);
782 *ctinfo = IP_CT_NEW;
783 }
784 *set_reply = 0;
785 }
786 skb->nfct = &ct->ct_general;
787 skb->nfctinfo = *ctinfo;
788 return ct;
789 }
790
791 /* Netfilter hook itself. */
792 unsigned int ip_conntrack_in(unsigned int hooknum,
793 struct sk_buff **pskb,
794 const struct net_device *in,
795 const struct net_device *out,
796 int (*okfn)(struct sk_buff *))
797 {
798 struct ip_conntrack *ct;
799 enum ip_conntrack_info ctinfo;
800 struct ip_conntrack_protocol *proto;
801 int set_reply = 0;
802 int ret;
803
804 /* Previously seen (loopback or untracked)? Ignore. */
805 if ((*pskb)->nfct) {
806 CONNTRACK_STAT_INC(ignore);
807 return NF_ACCEPT;
808 }
809
810 /* Should never happen */
811 if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
812 if (net_ratelimit()) {
813 printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
814 (*pskb)->nh.iph->protocol, hooknum);
815 }
816 return NF_DROP;
817 }
818
819 /* Doesn't cover locally-generated broadcast, so not worth it. */
820 #if 0
821 /* Ignore broadcast: no `connection'. */
822 if ((*pskb)->pkt_type == PACKET_BROADCAST) {
823 printk("Broadcast packet!\n");
824 return NF_ACCEPT;
825 } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
826 == htonl(0x000000FF)) {
827 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
828 NIPQUAD((*pskb)->nh.iph->saddr),
829 NIPQUAD((*pskb)->nh.iph->daddr),
830 (*pskb)->sk, (*pskb)->pkt_type);
831 }
832 #endif
833
834 proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
835
836 /* It may be a special packet, error, unclean...
837 * the inverse of the return code tells the netfilter
838 * core what to do with the packet. */
839 if (proto->error != NULL
840 && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
841 CONNTRACK_STAT_INC(error);
842 CONNTRACK_STAT_INC(invalid);
843 return -ret;
844 }
845
846 if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
847 /* Not valid part of a connection */
848 CONNTRACK_STAT_INC(invalid);
849 return NF_ACCEPT;
850 }
851
852 if (IS_ERR(ct)) {
853 /* Too stressed to deal. */
854 CONNTRACK_STAT_INC(drop);
855 return NF_DROP;
856 }
857
858 IP_NF_ASSERT((*pskb)->nfct);
859
860 ret = proto->packet(ct, *pskb, ctinfo);
861 if (ret < 0) {
862 /* Invalid: inverse of the return code tells
863 * the netfilter core what to do*/
864 nf_conntrack_put((*pskb)->nfct);
865 (*pskb)->nfct = NULL;
866 CONNTRACK_STAT_INC(invalid);
867 return -ret;
868 }
869
870 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
871 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
872
873 return ret;
874 }
875
876 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
877 const struct ip_conntrack_tuple *orig)
878 {
879 return ip_ct_invert_tuple(inverse, orig,
880 __ip_conntrack_proto_find(orig->dst.protonum));
881 }
882
883 /* Would two expected things clash? */
884 static inline int expect_clash(const struct ip_conntrack_expect *a,
885 const struct ip_conntrack_expect *b)
886 {
887 /* Part covered by intersection of masks must be unequal,
888 otherwise they clash */
889 struct ip_conntrack_tuple intersect_mask
890 = { { a->mask.src.ip & b->mask.src.ip,
891 { a->mask.src.u.all & b->mask.src.u.all } },
892 { a->mask.dst.ip & b->mask.dst.ip,
893 { a->mask.dst.u.all & b->mask.dst.u.all },
894 a->mask.dst.protonum & b->mask.dst.protonum } };
895
896 return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
897 }
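
/*
 * A hypothetical illustration of the clash rule above (addresses invented
 * for the example): an expectation for TCP 10.0.0.1 -> 10.0.0.2 whose mask
 * wildcards only the destination port clashes with a fully-masked
 * expectation for TCP 10.0.0.1 -> 10.0.0.2:2021.  The intersection of the
 * two masks still covers the addresses and the protocol number, and the two
 * tuples agree on every bit it covers, so ip_ct_tuple_mask_cmp() on the
 * intersected mask returns true.
 */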
898
899 static inline int expect_matches(const struct ip_conntrack_expect *a,
900 const struct ip_conntrack_expect *b)
901 {
902 return a->master == b->master
903 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
904 && ip_ct_tuple_equal(&a->mask, &b->mask);
905 }
906
907 /* Generally a bad idea to call this: could have matched already. */
908 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
909 {
910 struct ip_conntrack_expect *i;
911
912 write_lock_bh(&ip_conntrack_lock);
913 /* choose the oldest expectation to evict */
914 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
915 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
916 ip_ct_unlink_expect(i);
917 write_unlock_bh(&ip_conntrack_lock);
918 ip_conntrack_expect_put(i);
919 return;
920 }
921 }
922 write_unlock_bh(&ip_conntrack_lock);
923 }
924
925 /* We don't increase the master conntrack refcount for non-fulfilled
926 * conntracks. During the conntrack destruction, the expectations are
927 * always killed before the conntrack itself */
928 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
929 {
930 struct ip_conntrack_expect *new;
931
932 new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
933 if (!new) {
934 DEBUGP("expect_related: OOM allocating expect\n");
935 return NULL;
936 }
937 new->master = me;
938 atomic_set(&new->use, 1);
939 return new;
940 }
941
942 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
943 {
944 if (atomic_dec_and_test(&exp->use))
945 kmem_cache_free(ip_conntrack_expect_cachep, exp);
946 }
947
948 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
949 {
950 atomic_inc(&exp->use);
951 exp->master->expecting++;
952 list_add(&exp->list, &ip_conntrack_expect_list);
953
954 init_timer(&exp->timeout);
955 exp->timeout.data = (unsigned long)exp;
956 exp->timeout.function = expectation_timed_out;
957 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
958 add_timer(&exp->timeout);
959
960 exp->id = ++ip_conntrack_expect_next_id;
961 atomic_inc(&exp->use);
962 CONNTRACK_STAT_INC(expect_create);
963 }
964
965 /* Race with expectations being used means we could have none to find; OK. */
966 static void evict_oldest_expect(struct ip_conntrack *master)
967 {
968 struct ip_conntrack_expect *i;
969
970 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
971 if (i->master == master) {
972 if (del_timer(&i->timeout)) {
973 ip_ct_unlink_expect(i);
974 ip_conntrack_expect_put(i);
975 }
976 break;
977 }
978 }
979 }
980
981 static inline int refresh_timer(struct ip_conntrack_expect *i)
982 {
983 if (!del_timer(&i->timeout))
984 return 0;
985
986 i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
987 add_timer(&i->timeout);
988 return 1;
989 }
990
991 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
992 {
993 struct ip_conntrack_expect *i;
994 int ret;
995
996 DEBUGP("ip_conntrack_expect_related %p\n", expect);
997 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
998 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
999
1000 write_lock_bh(&ip_conntrack_lock);
1001 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1002 if (expect_matches(i, expect)) {
1003 /* Refresh timer: if it's dying, ignore.. */
1004 if (refresh_timer(i)) {
1005 ret = 0;
1006 goto out;
1007 }
1008 } else if (expect_clash(i, expect)) {
1009 ret = -EBUSY;
1010 goto out;
1011 }
1012 }
1013
1014 /* Will be over limit? */
1015 if (expect->master->helper->max_expected &&
1016 expect->master->expecting >= expect->master->helper->max_expected)
1017 evict_oldest_expect(expect->master);
1018
1019 ip_conntrack_expect_insert(expect);
1020 ip_conntrack_expect_event(IPEXP_NEW, expect);
1021 ret = 0;
1022 out:
1023 write_unlock_bh(&ip_conntrack_lock);
1024 return ret;
1025 }
1026
1027 /* Alter reply tuple (maybe alter helper). This is for NAT, and is
1028 implicitly racy: see __ip_conntrack_confirm */
1029 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1030 const struct ip_conntrack_tuple *newreply)
1031 {
1032 write_lock_bh(&ip_conntrack_lock);
1033 /* Should be unconfirmed, so not in hash table yet */
1034 IP_NF_ASSERT(!is_confirmed(conntrack));
1035
1036 DEBUGP("Altering reply tuple of %p to ", conntrack);
1037 DUMP_TUPLE(newreply);
1038
1039 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1040 if (!conntrack->master && conntrack->expecting == 0)
1041 conntrack->helper = __ip_conntrack_helper_find(newreply);
1042 write_unlock_bh(&ip_conntrack_lock);
1043 }
1044
1045 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1046 {
1047 BUG_ON(me->timeout == 0);
1048 write_lock_bh(&ip_conntrack_lock);
1049 list_add(&me->list, &helpers);
1050 write_unlock_bh(&ip_conntrack_lock);
1051
1052 return 0;
1053 }
1054
1055 struct ip_conntrack_helper *
1056 __ip_conntrack_helper_find_byname(const char *name)
1057 {
1058 struct ip_conntrack_helper *h;
1059
1060 list_for_each_entry(h, &helpers, list) {
1061 if (!strcmp(h->name, name))
1062 return h;
1063 }
1064
1065 return NULL;
1066 }
1067
1068 static inline void unhelp(struct ip_conntrack_tuple_hash *i,
1069 const struct ip_conntrack_helper *me)
1070 {
1071 if (tuplehash_to_ctrack(i)->helper == me) {
1072 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1073 tuplehash_to_ctrack(i)->helper = NULL;
1074 }
1075 }
1076
1077 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1078 {
1079 unsigned int i;
1080 struct ip_conntrack_tuple_hash *h;
1081 struct ip_conntrack_expect *exp, *tmp;
1082
1083 /* Need write lock here, to delete helper. */
1084 write_lock_bh(&ip_conntrack_lock);
1085 list_del(&me->list);
1086
1087 /* Get rid of expectations */
1088 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1089 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1090 ip_ct_unlink_expect(exp);
1091 ip_conntrack_expect_put(exp);
1092 }
1093 }
1094 /* Get rid of expecteds, set helpers to NULL. */
1095 list_for_each_entry(h, &unconfirmed, list)
1096 unhelp(h, me);
1097 for (i = 0; i < ip_conntrack_htable_size; i++) {
1098 list_for_each_entry(h, &ip_conntrack_hash[i], list)
1099 unhelp(h, me);
1100 }
1101 write_unlock_bh(&ip_conntrack_lock);
1102
1103 /* Someone could be still looking at the helper in a bh. */
1104 synchronize_net();
1105 }
1106
1107 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1108 void __ip_ct_refresh_acct(struct ip_conntrack *ct,
1109 enum ip_conntrack_info ctinfo,
1110 const struct sk_buff *skb,
1111 unsigned long extra_jiffies,
1112 int do_acct)
1113 {
1114 int event = 0;
1115
1116 IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1117 IP_NF_ASSERT(skb);
1118
1119 write_lock_bh(&ip_conntrack_lock);
1120
1121 /* Only update if this is not a fixed timeout */
1122 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
1123 write_unlock_bh(&ip_conntrack_lock);
1124 return;
1125 }
1126
1127 /* If not in hash table, timer will not be active yet */
1128 if (!is_confirmed(ct)) {
1129 ct->timeout.expires = extra_jiffies;
1130 event = IPCT_REFRESH;
1131 } else {
1132 /* Need del_timer for race avoidance (may already be dying). */
1133 if (del_timer(&ct->timeout)) {
1134 ct->timeout.expires = jiffies + extra_jiffies;
1135 add_timer(&ct->timeout);
1136 event = IPCT_REFRESH;
1137 }
1138 }
1139
1140 #ifdef CONFIG_IP_NF_CT_ACCT
1141 if (do_acct) {
1142 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1143 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1144 ntohs(skb->nh.iph->tot_len);
1145 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1146 || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1147 event |= IPCT_COUNTER_FILLING;
1148 }
1149 #endif
1150
1151 write_unlock_bh(&ip_conntrack_lock);
1152
1153 /* must be unlocked when calling event cache */
1154 if (event)
1155 ip_conntrack_event_cache(event, skb);
1156 }
1157
1158 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1159 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1160 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1161 * in ip_conntrack_core, since we don't want the protocols to autoload
1162 * or depend on ctnetlink */
1163 int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1164 const struct ip_conntrack_tuple *tuple)
1165 {
1166 NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(__be16),
1167 &tuple->src.u.tcp.port);
1168 NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(__be16),
1169 &tuple->dst.u.tcp.port);
1170 return 0;
1171
1172 nfattr_failure:
1173 return -1;
1174 }
1175
1176 int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1177 struct ip_conntrack_tuple *t)
1178 {
1179 if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1180 return -EINVAL;
1181
1182 t->src.u.tcp.port =
1183 *(__be16 *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1184 t->dst.u.tcp.port =
1185 *(__be16 *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1186
1187 return 0;
1188 }
1189 #endif
1190
1191 /* Returns new sk_buff, or NULL */
1192 struct sk_buff *
1193 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1194 {
1195 skb_orphan(skb);
1196
1197 local_bh_disable();
1198 skb = ip_defrag(skb, user);
1199 local_bh_enable();
1200
1201 if (skb)
1202 ip_send_check(skb->nh.iph);
1203 return skb;
1204 }
1205
1206 /* Used by ipt_REJECT. */
1207 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1208 {
1209 struct ip_conntrack *ct;
1210 enum ip_conntrack_info ctinfo;
1211
1212 /* This ICMP is in reverse direction to the packet which caused it */
1213 ct = ip_conntrack_get(skb, &ctinfo);
1214
1215 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1216 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1217 else
1218 ctinfo = IP_CT_RELATED;
1219
1220 /* Attach to new skbuff, and increment count */
1221 nskb->nfct = &ct->ct_general;
1222 nskb->nfctinfo = ctinfo;
1223 nf_conntrack_get(nskb->nfct);
1224 }
1225
1226 /* Bring out ya dead! */
1227 static struct ip_conntrack *
1228 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1229 void *data, unsigned int *bucket)
1230 {
1231 struct ip_conntrack_tuple_hash *h;
1232 struct ip_conntrack *ct;
1233
1234 write_lock_bh(&ip_conntrack_lock);
1235 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1236 list_for_each_entry(h, &ip_conntrack_hash[*bucket], list) {
1237 ct = tuplehash_to_ctrack(h);
1238 if (iter(ct, data))
1239 goto found;
1240 }
1241 }
1242 list_for_each_entry(h, &unconfirmed, list) {
1243 ct = tuplehash_to_ctrack(h);
1244 if (iter(ct, data))
1245 goto found;
1246 }
1247 write_unlock_bh(&ip_conntrack_lock);
1248 return NULL;
1249
1250 found:
1251 atomic_inc(&ct->ct_general.use);
1252 write_unlock_bh(&ip_conntrack_lock);
1253 return ct;
1254 }
1255
1256 void
1257 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1258 {
1259 struct ip_conntrack *ct;
1260 unsigned int bucket = 0;
1261
1262 while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
1263 /* Time to push up daisies... */
1264 if (del_timer(&ct->timeout))
1265 death_by_timeout((unsigned long)ct);
1266 /* ... else the timer will get him soon. */
1267
1268 ip_conntrack_put(ct);
1269 }
1270 }
1271
1272 /* Fast function for those who don't want to parse /proc (and I don't
1273 blame them). */
1274 /* Reversing the socket's dst/src point of view gives us the reply
1275 mapping. */
1276 static int
1277 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1278 {
1279 struct inet_sock *inet = inet_sk(sk);
1280 struct ip_conntrack_tuple_hash *h;
1281 struct ip_conntrack_tuple tuple;
1282
1283 IP_CT_TUPLE_U_BLANK(&tuple);
1284 tuple.src.ip = inet->rcv_saddr;
1285 tuple.src.u.tcp.port = inet->sport;
1286 tuple.dst.ip = inet->daddr;
1287 tuple.dst.u.tcp.port = inet->dport;
1288 tuple.dst.protonum = IPPROTO_TCP;
1289
1290 /* We only do TCP at the moment: is there a better way? */
1291 if (strcmp(sk->sk_prot->name, "TCP")) {
1292 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1293 return -ENOPROTOOPT;
1294 }
1295
1296 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1297 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1298 *len, sizeof(struct sockaddr_in));
1299 return -EINVAL;
1300 }
1301
1302 h = ip_conntrack_find_get(&tuple, NULL);
1303 if (h) {
1304 struct sockaddr_in sin;
1305 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1306
1307 sin.sin_family = AF_INET;
1308 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1309 .tuple.dst.u.tcp.port;
1310 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1311 .tuple.dst.ip;
1312 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
1313
1314 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1315 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1316 ip_conntrack_put(ct);
1317 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1318 return -EFAULT;
1319 else
1320 return 0;
1321 }
1322 DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1323 NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1324 NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1325 return -ENOENT;
1326 }
1327
1328 static struct nf_sockopt_ops so_getorigdst = {
1329 .pf = PF_INET,
1330 .get_optmin = SO_ORIGINAL_DST,
1331 .get_optmax = SO_ORIGINAL_DST+1,
1332 .get = &getorigdst,
1333 };
1334
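/*
 * Illustrative user-space sketch (not part of this file): the sockopt
 * registered above is what lets a transparently-redirected TCP proxy ask
 * conntrack for the pre-NAT destination.  A minimal caller, assuming fd is
 * a TCP socket obtained from accept() on a REDIRECTed connection:
 */
#include <stdio.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/netfilter_ipv4.h>	/* SO_ORIGINAL_DST */

static int print_original_dst(int fd)
{
	struct sockaddr_in orig;
	socklen_t len = sizeof(orig);

	/* Handled in the kernel by getorigdst() above. */
	if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &orig, &len) < 0) {
		perror("getsockopt(SO_ORIGINAL_DST)");
		return -1;
	}
	printf("original destination: %s:%u\n",
	       inet_ntoa(orig.sin_addr), ntohs(orig.sin_port));
	return 0;
}
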
1335 static int kill_all(struct ip_conntrack *i, void *data)
1336 {
1337 return 1;
1338 }
1339
1340 void ip_conntrack_flush(void)
1341 {
1342 ip_ct_iterate_cleanup(kill_all, NULL);
1343 }
1344
1345 static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
1346 {
1347 if (vmalloced)
1348 vfree(hash);
1349 else
1350 free_pages((unsigned long)hash,
1351 get_order(sizeof(struct list_head) * size));
1352 }
1353
1354 /* Mishearing the voices in his head, our hero wonders how he's
1355 supposed to kill the mall. */
1356 void ip_conntrack_cleanup(void)
1357 {
1358 ip_ct_attach = NULL;
1359
1360 /* This makes sure all current packets have passed through
1361 netfilter framework. Roll on, two-stage module
1362 delete... */
1363 synchronize_net();
1364
1365 ip_ct_event_cache_flush();
1366 i_see_dead_people:
1367 ip_conntrack_flush();
1368 if (atomic_read(&ip_conntrack_count) != 0) {
1369 schedule();
1370 goto i_see_dead_people;
1371 }
1372 /* wait until all references to ip_conntrack_untracked are dropped */
1373 while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1374 schedule();
1375
1376 kmem_cache_destroy(ip_conntrack_cachep);
1377 kmem_cache_destroy(ip_conntrack_expect_cachep);
1378 free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1379 ip_conntrack_htable_size);
1380 nf_unregister_sockopt(&so_getorigdst);
1381 }
1382
1383 static struct list_head *alloc_hashtable(int size, int *vmalloced)
1384 {
1385 struct list_head *hash;
1386 unsigned int i;
1387
1388 *vmalloced = 0;
1389 hash = (void*)__get_free_pages(GFP_KERNEL,
1390 get_order(sizeof(struct list_head)
1391 * size));
1392 if (!hash) {
1393 *vmalloced = 1;
1394 printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n");
1395 hash = vmalloc(sizeof(struct list_head) * size);
1396 }
1397
1398 if (hash)
1399 for (i = 0; i < size; i++)
1400 INIT_LIST_HEAD(&hash[i]);
1401
1402 return hash;
1403 }
1404
1405 static int set_hashsize(const char *val, struct kernel_param *kp)
1406 {
1407 int i, bucket, hashsize, vmalloced;
1408 int old_vmalloced, old_size;
1409 int rnd;
1410 struct list_head *hash, *old_hash;
1411 struct ip_conntrack_tuple_hash *h;
1412
1413 /* On boot, we can set this without any fancy locking. */
1414 if (!ip_conntrack_htable_size)
1415 return param_set_int(val, kp);
1416
1417 hashsize = simple_strtol(val, NULL, 0);
1418 if (!hashsize)
1419 return -EINVAL;
1420
1421 hash = alloc_hashtable(hashsize, &vmalloced);
1422 if (!hash)
1423 return -ENOMEM;
1424
1425 /* We have to rehash for the new table anyway, so we also can
1426 * use a new random seed */
1427 get_random_bytes(&rnd, 4);
1428
1429 write_lock_bh(&ip_conntrack_lock);
1430 for (i = 0; i < ip_conntrack_htable_size; i++) {
1431 while (!list_empty(&ip_conntrack_hash[i])) {
1432 h = list_entry(ip_conntrack_hash[i].next,
1433 struct ip_conntrack_tuple_hash, list);
1434 list_del(&h->list);
1435 bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1436 list_add_tail(&h->list, &hash[bucket]);
1437 }
1438 }
1439 old_size = ip_conntrack_htable_size;
1440 old_vmalloced = ip_conntrack_vmalloc;
1441 old_hash = ip_conntrack_hash;
1442
1443 ip_conntrack_htable_size = hashsize;
1444 ip_conntrack_vmalloc = vmalloced;
1445 ip_conntrack_hash = hash;
1446 ip_conntrack_hash_rnd = rnd;
1447 write_unlock_bh(&ip_conntrack_lock);
1448
1449 free_conntrack_hash(old_hash, old_vmalloced, old_size);
1450 return 0;
1451 }
1452
1453 module_param_call(hashsize, set_hashsize, param_get_uint,
1454 &ip_conntrack_htable_size, 0600);
1455
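/*
 * Usage sketch (an assumption based on the 0600 permission above, not a
 * statement from this file): with the module loaded, root can resize the
 * table at runtime, e.g.
 *
 *	echo 16384 > /sys/module/ip_conntrack/parameters/hashsize
 *
 * which invokes set_hashsize(), rehashes every entry into the new table
 * under ip_conntrack_lock, and then frees the old hash.
 */
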
1456 int __init ip_conntrack_init(void)
1457 {
1458 unsigned int i;
1459 int ret;
1460
1461 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1462 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
1463 if (!ip_conntrack_htable_size) {
1464 ip_conntrack_htable_size
1465 = (((num_physpages << PAGE_SHIFT) / 16384)
1466 / sizeof(struct list_head));
1467 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1468 ip_conntrack_htable_size = 8192;
1469 if (ip_conntrack_htable_size < 16)
1470 ip_conntrack_htable_size = 16;
1471 }
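	/* Worked example of the heuristic above (assuming i386 with 4KB
	 * pages and an 8-byte struct list_head): a 32MB machine gives
	 * 33554432 / 16384 / 8 = 256 buckets; at 1GB the same formula
	 * gives 65536 / 8 = 8192 buckets, which is where the cap kicks
	 * in.  ip_conntrack_max below then defaults to 8 entries per
	 * bucket. */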
1472 ip_conntrack_max = 8 * ip_conntrack_htable_size;
1473
1474 printk("ip_conntrack version %s (%u buckets, %d max)"
1475 " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1476 ip_conntrack_htable_size, ip_conntrack_max,
1477 sizeof(struct ip_conntrack));
1478
1479 ret = nf_register_sockopt(&so_getorigdst);
1480 if (ret != 0) {
1481 printk(KERN_ERR "Unable to register netfilter socket option\n");
1482 return ret;
1483 }
1484
1485 ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
1486 &ip_conntrack_vmalloc);
1487 if (!ip_conntrack_hash) {
1488 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1489 goto err_unreg_sockopt;
1490 }
1491
1492 ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1493 sizeof(struct ip_conntrack), 0,
1494 0, NULL, NULL);
1495 if (!ip_conntrack_cachep) {
1496 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1497 goto err_free_hash;
1498 }
1499
1500 ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1501 sizeof(struct ip_conntrack_expect),
1502 0, 0, NULL, NULL);
1503 if (!ip_conntrack_expect_cachep) {
1504 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1505 goto err_free_conntrack_slab;
1506 }
1507
1508 /* Don't NEED lock here, but good form anyway. */
1509 write_lock_bh(&ip_conntrack_lock);
1510 for (i = 0; i < MAX_IP_CT_PROTO; i++)
1511 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1512 /* Sew in builtin protocols. */
1513 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1514 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1515 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1516 write_unlock_bh(&ip_conntrack_lock);
1517
1518 /* For use by ipt_REJECT */
1519 ip_ct_attach = ip_conntrack_attach;
1520
1521 /* Set up fake conntrack:
1522 - to never be deleted, not in any hashes */
1523 atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1524 /* - and make it look like a confirmed connection */
1525 set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1526
1527 return ret;
1528
1529 err_free_conntrack_slab:
1530 kmem_cache_destroy(ip_conntrack_cachep);
1531 err_free_hash:
1532 free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1533 ip_conntrack_htable_size);
1534 err_unreg_sockopt:
1535 nf_unregister_sockopt(&so_getorigdst);
1536
1537 return -ENOMEM;
1538 }