1 /* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 *
12 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13 * - new API and handling of conntrack/nat helpers
14 * - now capable of multiple expectations for one master
15 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16 * - add usage/reference counts to ip_conntrack_expect
17 * - export ip_conntrack[_expect]_{find_get,put} functions
18 * */
19
20 #include <linux/types.h>
21 #include <linux/icmp.h>
22 #include <linux/ip.h>
23 #include <linux/netfilter.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/module.h>
26 #include <linux/skbuff.h>
27 #include <linux/proc_fs.h>
28 #include <linux/vmalloc.h>
29 #include <net/checksum.h>
30 #include <net/ip.h>
31 #include <linux/stddef.h>
32 #include <linux/sysctl.h>
33 #include <linux/slab.h>
34 #include <linux/random.h>
35 #include <linux/jhash.h>
36 #include <linux/err.h>
37 #include <linux/percpu.h>
38 #include <linux/moduleparam.h>
39 #include <linux/notifier.h>
40
41 /* ip_conntrack_lock protects the main hash table, protocol/helper/expected
42 registrations and conntrack timers. */
43 #define ASSERT_READ_LOCK(x)
44 #define ASSERT_WRITE_LOCK(x)
45
46 #include <linux/netfilter_ipv4/ip_conntrack.h>
47 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
50
51 #define IP_CONNTRACK_VERSION "2.4"
52
53 #if 0
54 #define DEBUGP printk
55 #else
56 #define DEBUGP(format, args...)
57 #endif
58
59 DEFINE_RWLOCK(ip_conntrack_lock);
60
61 /* ip_conntrack_standalone needs this */
62 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
63
64 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
65 LIST_HEAD(ip_conntrack_expect_list);
66 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
67 static LIST_HEAD(helpers);
68 unsigned int ip_conntrack_htable_size __read_mostly = 0;
69 int ip_conntrack_max __read_mostly;
70 struct list_head *ip_conntrack_hash;
71 static kmem_cache_t *ip_conntrack_cachep __read_mostly;
72 static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
73 struct ip_conntrack ip_conntrack_untracked;
74 unsigned int ip_ct_log_invalid __read_mostly;
75 static LIST_HEAD(unconfirmed);
76 static int ip_conntrack_vmalloc;
77
78 static unsigned int ip_conntrack_next_id;
79 static unsigned int ip_conntrack_expect_next_id;
80 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
81 ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
82 ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);
83
84 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
85
86 /* deliver cached events and clear cache entry - must be called with locally
87 * disabled softirqs */
88 static inline void
89 __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
90 {
91 DEBUGP("ecache: delivering events for %p\n", ecache->ct);
92 if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
93 atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
94 ecache->ct);
95 ecache->events = 0;
96 ip_conntrack_put(ecache->ct);
97 ecache->ct = NULL;
98 }
99
100 /* Deliver all cached events for a particular conntrack. This is called
101 * by code prior to async packet handling or freeing the skb */
102 void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
103 {
104 struct ip_conntrack_ecache *ecache;
105
106 local_bh_disable();
107 ecache = &__get_cpu_var(ip_conntrack_ecache);
108 if (ecache->ct == ct)
109 __ip_ct_deliver_cached_events(ecache);
110 local_bh_enable();
111 }
112
113 void __ip_ct_event_cache_init(struct ip_conntrack *ct)
114 {
115 struct ip_conntrack_ecache *ecache;
116
117 /* take care of delivering potentially old events */
118 ecache = &__get_cpu_var(ip_conntrack_ecache);
119 BUG_ON(ecache->ct == ct);
120 if (ecache->ct)
121 __ip_ct_deliver_cached_events(ecache);
122 /* initialize for this conntrack/packet */
123 ecache->ct = ct;
124 nf_conntrack_get(&ct->ct_general);
125 }
126
127 /* flush the event cache - touches other CPUs' data and must not be called while
128 * packets are still passing through the code */
129 static void ip_ct_event_cache_flush(void)
130 {
131 struct ip_conntrack_ecache *ecache;
132 int cpu;
133
134 for_each_possible_cpu(cpu) {
135 ecache = &per_cpu(ip_conntrack_ecache, cpu);
136 if (ecache->ct)
137 ip_conntrack_put(ecache->ct);
138 }
139 }
140 #else
141 static inline void ip_ct_event_cache_flush(void) {}
142 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
143
144 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
145
146 static int ip_conntrack_hash_rnd_initted;
147 static unsigned int ip_conntrack_hash_rnd;
148
149 static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
150 unsigned int size, unsigned int rnd)
151 {
152 return (jhash_3words(tuple->src.ip,
153 (tuple->dst.ip ^ tuple->dst.protonum),
154 (tuple->src.u.all | (tuple->dst.u.all << 16)),
155 rnd) % size);
156 }
157
158 static u_int32_t
159 hash_conntrack(const struct ip_conntrack_tuple *tuple)
160 {
161 return __hash_conntrack(tuple, ip_conntrack_htable_size,
162 ip_conntrack_hash_rnd);
163 }
164
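/* A minimal illustrative sketch (kept disabled, like the other #if 0 blocks
 * in this file): how a caller would build a TCP tuple and see which bucket
 * it lands in.  The function name and values are hypothetical; only the
 * fields actually mixed by __hash_conntrack() are filled in, and the result
 * is only stable for the current ip_conntrack_hash_rnd. */
#if 0
static unsigned int example_bucket_for_tcp(u_int32_t saddr, u_int32_t daddr,
					   u_int16_t sport, u_int16_t dport)
{
	struct ip_conntrack_tuple t;

	memset(&t, 0, sizeof(t));
	t.src.ip = saddr;			/* network byte order */
	t.dst.ip = daddr;			/* network byte order */
	t.src.u.tcp.port = sport;		/* network byte order */
	t.dst.u.tcp.port = dport;		/* network byte order */
	t.dst.protonum = IPPROTO_TCP;
	t.dst.dir = IP_CT_DIR_ORIGINAL;

	return hash_conntrack(&t);
}
#endif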
165 int
166 ip_ct_get_tuple(const struct iphdr *iph,
167 const struct sk_buff *skb,
168 unsigned int dataoff,
169 struct ip_conntrack_tuple *tuple,
170 const struct ip_conntrack_protocol *protocol)
171 {
172 /* Should never happen */
173 if (iph->frag_off & htons(IP_OFFSET)) {
174 printk("ip_conntrack_core: Frag of proto %u.\n",
175 iph->protocol);
176 return 0;
177 }
178
179 tuple->src.ip = iph->saddr;
180 tuple->dst.ip = iph->daddr;
181 tuple->dst.protonum = iph->protocol;
182 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
183
184 return protocol->pkt_to_tuple(skb, dataoff, tuple);
185 }
186
187 int
188 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
189 const struct ip_conntrack_tuple *orig,
190 const struct ip_conntrack_protocol *protocol)
191 {
192 inverse->src.ip = orig->dst.ip;
193 inverse->dst.ip = orig->src.ip;
194 inverse->dst.protonum = orig->dst.protonum;
195 inverse->dst.dir = !orig->dst.dir;
196
197 return protocol->invert_tuple(inverse, orig);
198 }
199
200
201 /* ip_conntrack_expect helper functions */
202 void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
203 {
204 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
205 IP_NF_ASSERT(!timer_pending(&exp->timeout));
206 list_del(&exp->list);
207 CONNTRACK_STAT_INC(expect_delete);
208 exp->master->expecting--;
209 ip_conntrack_expect_put(exp);
210 }
211
212 static void expectation_timed_out(unsigned long ul_expect)
213 {
214 struct ip_conntrack_expect *exp = (void *)ul_expect;
215
216 write_lock_bh(&ip_conntrack_lock);
217 ip_ct_unlink_expect(exp);
218 write_unlock_bh(&ip_conntrack_lock);
219 ip_conntrack_expect_put(exp);
220 }
221
222 struct ip_conntrack_expect *
223 __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
224 {
225 struct ip_conntrack_expect *i;
226
227 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
228 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
229 atomic_inc(&i->use);
230 return i;
231 }
232 }
233 return NULL;
234 }
235
236 /* Just find an expectation corresponding to a tuple. */
237 struct ip_conntrack_expect *
238 ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
239 {
240 struct ip_conntrack_expect *i;
241
242 read_lock_bh(&ip_conntrack_lock);
243 i = __ip_conntrack_expect_find(tuple);
244 read_unlock_bh(&ip_conntrack_lock);
245
246 return i;
247 }
248
249 /* If an expectation for this connection is found, it is deleted from the
250 * global list and then returned. */
251 static struct ip_conntrack_expect *
252 find_expectation(const struct ip_conntrack_tuple *tuple)
253 {
254 struct ip_conntrack_expect *i;
255
256 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
257 /* If master is not in hash table yet (i.e. packet hasn't left
258 this machine yet), how can the other end know about the expected connection?
259 Hence these are not the droids you are looking for (if
260 master ct never got confirmed, we'd hold a reference to it
261 and weird things would happen to future packets). */
262 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
263 && is_confirmed(i->master)) {
264 if (i->flags & IP_CT_EXPECT_PERMANENT) {
265 atomic_inc(&i->use);
266 return i;
267 } else if (del_timer(&i->timeout)) {
268 ip_ct_unlink_expect(i);
269 return i;
270 }
271 }
272 }
273 return NULL;
274 }
275
276 /* delete all expectations for this conntrack */
277 void ip_ct_remove_expectations(struct ip_conntrack *ct)
278 {
279 struct ip_conntrack_expect *i, *tmp;
280
281 /* Optimization: most connections never expect any others. */
282 if (ct->expecting == 0)
283 return;
284
285 list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
286 if (i->master == ct && del_timer(&i->timeout)) {
287 ip_ct_unlink_expect(i);
288 ip_conntrack_expect_put(i);
289 }
290 }
291 }
292
293 static void
294 clean_from_lists(struct ip_conntrack *ct)
295 {
296 DEBUGP("clean_from_lists(%p)\n", ct);
297 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
298 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
299 list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list);
300
301 /* Destroy all pending expectations */
302 ip_ct_remove_expectations(ct);
303 }
304
305 static void
306 destroy_conntrack(struct nf_conntrack *nfct)
307 {
308 struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
309 struct ip_conntrack_protocol *proto;
310
311 DEBUGP("destroy_conntrack(%p)\n", ct);
312 IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
313 IP_NF_ASSERT(!timer_pending(&ct->timeout));
314
315 ip_conntrack_event(IPCT_DESTROY, ct);
316 set_bit(IPS_DYING_BIT, &ct->status);
317
318 /* To make sure we don't get any weird locking issues here:
319 * destroy_conntrack() MUST NOT be called with a write lock
320 * to ip_conntrack_lock!!! -HW */
321 proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
322 if (proto && proto->destroy)
323 proto->destroy(ct);
324
325 if (ip_conntrack_destroyed)
326 ip_conntrack_destroyed(ct);
327
328 write_lock_bh(&ip_conntrack_lock);
329 /* Expectations will have been removed in clean_from_lists,
330 * except TFTP can create an expectation on the first packet,
331 * before connection is in the list, so we need to clean here,
332 * too. */
333 ip_ct_remove_expectations(ct);
334
335 /* We overload first tuple to link into unconfirmed list. */
336 if (!is_confirmed(ct)) {
337 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
338 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
339 }
340
341 CONNTRACK_STAT_INC(delete);
342 write_unlock_bh(&ip_conntrack_lock);
343
344 if (ct->master)
345 ip_conntrack_put(ct->master);
346
347 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
348 ip_conntrack_free(ct);
349 }
350
351 static void death_by_timeout(unsigned long ul_conntrack)
352 {
353 struct ip_conntrack *ct = (void *)ul_conntrack;
354
355 write_lock_bh(&ip_conntrack_lock);
356 /* Inside lock so preempt is disabled on module removal path.
357 * Otherwise we can get spurious warnings. */
358 CONNTRACK_STAT_INC(delete_list);
359 clean_from_lists(ct);
360 write_unlock_bh(&ip_conntrack_lock);
361 ip_conntrack_put(ct);
362 }
363
364 struct ip_conntrack_tuple_hash *
365 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
366 const struct ip_conntrack *ignored_conntrack)
367 {
368 struct ip_conntrack_tuple_hash *h;
369 unsigned int hash = hash_conntrack(tuple);
370
371 ASSERT_READ_LOCK(&ip_conntrack_lock);
372 list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
373 if (tuplehash_to_ctrack(h) != ignored_conntrack &&
374 ip_ct_tuple_equal(tuple, &h->tuple)) {
375 CONNTRACK_STAT_INC(found);
376 return h;
377 }
378 CONNTRACK_STAT_INC(searched);
379 }
380
381 return NULL;
382 }
383
384 /* Find a connection corresponding to a tuple. */
385 struct ip_conntrack_tuple_hash *
386 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
387 const struct ip_conntrack *ignored_conntrack)
388 {
389 struct ip_conntrack_tuple_hash *h;
390
391 read_lock_bh(&ip_conntrack_lock);
392 h = __ip_conntrack_find(tuple, ignored_conntrack);
393 if (h)
394 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
395 read_unlock_bh(&ip_conntrack_lock);
396
397 return h;
398 }
399
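/* A small usage sketch (disabled): the lookup/put discipline expected of
 * ip_conntrack_find_get() callers -- getorigdst() below follows the same
 * pattern.  The wrapper function itself is hypothetical. */
#if 0
static int example_tuple_is_tracked(const struct ip_conntrack_tuple *tuple)
{
	struct ip_conntrack_tuple_hash *h;
	int tracked = 0;

	h = ip_conntrack_find_get(tuple, NULL);
	if (h) {
		struct ip_conntrack *ct = tuplehash_to_ctrack(h);

		/* ... inspect ct under its reference here ... */
		tracked = 1;
		ip_conntrack_put(ct);	/* drop the reference find_get took */
	}
	return tracked;
}
#endif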
400 static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
401 unsigned int hash,
402 unsigned int repl_hash)
403 {
404 ct->id = ++ip_conntrack_next_id;
405 list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list,
406 &ip_conntrack_hash[hash]);
407 list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list,
408 &ip_conntrack_hash[repl_hash]);
409 }
410
411 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
412 {
413 unsigned int hash, repl_hash;
414
415 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
416 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
417
418 write_lock_bh(&ip_conntrack_lock);
419 __ip_conntrack_hash_insert(ct, hash, repl_hash);
420 write_unlock_bh(&ip_conntrack_lock);
421 }
422
423 /* Confirm a connection given skb; places it in hash table */
424 int
425 __ip_conntrack_confirm(struct sk_buff **pskb)
426 {
427 unsigned int hash, repl_hash;
428 struct ip_conntrack_tuple_hash *h;
429 struct ip_conntrack *ct;
430 enum ip_conntrack_info ctinfo;
431
432 ct = ip_conntrack_get(*pskb, &ctinfo);
433
434 /* ipt_REJECT uses ip_conntrack_attach to attach related
435 ICMP/TCP RST packets in the other direction. The actual packet
436 which created the connection will be IP_CT_NEW or, for an
437 expected connection, IP_CT_RELATED. */
438 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
439 return NF_ACCEPT;
440
441 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
442 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
443
444 /* We're not in hash table, and we refuse to set up related
445 connections for unconfirmed conns. But packet copies and
446 REJECT will give spurious warnings here. */
447 /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
448
449 /* No external references means no one else could have
450 confirmed us. */
451 IP_NF_ASSERT(!is_confirmed(ct));
452 DEBUGP("Confirming conntrack %p\n", ct);
453
454 write_lock_bh(&ip_conntrack_lock);
455
456 /* See if there's one in the list already, including reverse:
457 NAT could have grabbed it without realizing, since we're
458 not in the hash. If there is, we lost the race. */
459 list_for_each_entry(h, &ip_conntrack_hash[hash], list)
460 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
461 &h->tuple))
462 goto out;
463 list_for_each_entry(h, &ip_conntrack_hash[repl_hash], list)
464 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
465 &h->tuple))
466 goto out;
467
468 /* Remove from unconfirmed list */
469 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
470
471 __ip_conntrack_hash_insert(ct, hash, repl_hash);
472 /* Timer relative to confirmation time, not original
473 setting time, otherwise we'd get timer wrap in
474 weird delay cases. */
475 ct->timeout.expires += jiffies;
476 add_timer(&ct->timeout);
477 atomic_inc(&ct->ct_general.use);
478 set_bit(IPS_CONFIRMED_BIT, &ct->status);
479 CONNTRACK_STAT_INC(insert);
480 write_unlock_bh(&ip_conntrack_lock);
481 if (ct->helper)
482 ip_conntrack_event_cache(IPCT_HELPER, *pskb);
483 #ifdef CONFIG_IP_NF_NAT_NEEDED
484 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
485 test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
486 ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
487 #endif
488 ip_conntrack_event_cache(master_ct(ct) ?
489 IPCT_RELATED : IPCT_NEW, *pskb);
490
491 return NF_ACCEPT;
492
493 out:
494 CONNTRACK_STAT_INC(insert_failed);
495 write_unlock_bh(&ip_conntrack_lock);
496 return NF_DROP;
497 }
498
499 /* Returns true if a connection corresponds to the tuple (required
500 for NAT). */
501 int
502 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
503 const struct ip_conntrack *ignored_conntrack)
504 {
505 struct ip_conntrack_tuple_hash *h;
506
507 read_lock_bh(&ip_conntrack_lock);
508 h = __ip_conntrack_find(tuple, ignored_conntrack);
509 read_unlock_bh(&ip_conntrack_lock);
510
511 return h != NULL;
512 }
513
514 /* There's a small race here where we may free a just-assured
515 connection. Too bad: we're in trouble anyway. */
516 static int early_drop(struct list_head *chain)
517 {
518 /* Traverse backwards: gives us oldest, which is roughly LRU */
519 struct ip_conntrack_tuple_hash *h;
520 struct ip_conntrack *ct = NULL, *tmp;
521 int dropped = 0;
522
523 read_lock_bh(&ip_conntrack_lock);
524 list_for_each_entry_reverse(h, chain, list) {
525 tmp = tuplehash_to_ctrack(h);
526 if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) {
527 ct = tmp;
528 atomic_inc(&ct->ct_general.use);
529 break;
530 }
531 }
532 read_unlock_bh(&ip_conntrack_lock);
533
534 if (!ct)
535 return dropped;
536
537 if (del_timer(&ct->timeout)) {
538 death_by_timeout((unsigned long)ct);
539 dropped = 1;
540 CONNTRACK_STAT_INC(early_drop);
541 }
542 ip_conntrack_put(ct);
543 return dropped;
544 }
545
546 static struct ip_conntrack_helper *
547 __ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
548 {
549 struct ip_conntrack_helper *h;
550
551 list_for_each_entry(h, &helpers, list) {
552 if (ip_ct_tuple_mask_cmp(tuple, &h->tuple, &h->mask))
553 return h;
554 }
555 return NULL;
556 }
557
558 struct ip_conntrack_helper *
559 ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
560 {
561 struct ip_conntrack_helper *helper;
562
563 /* need ip_conntrack_lock to assure that helper exists until
564 * try_module_get() is called */
565 read_lock_bh(&ip_conntrack_lock);
566
567 helper = __ip_conntrack_helper_find(tuple);
568 if (helper) {
569 /* need to increase module usage count to assure helper will
570 * not go away while the caller is e.g. busy putting a
571 * conntrack in the hash that uses the helper */
572 if (!try_module_get(helper->me))
573 helper = NULL;
574 }
575
576 read_unlock_bh(&ip_conntrack_lock);
577
578 return helper;
579 }
580
581 void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
582 {
583 module_put(helper->me);
584 }
585
586 struct ip_conntrack_protocol *
587 __ip_conntrack_proto_find(u_int8_t protocol)
588 {
589 return ip_ct_protos[protocol];
590 }
591
592 /* this is guaranteed to always return a valid protocol helper, since
593 * it falls back to generic_protocol */
594 struct ip_conntrack_protocol *
595 ip_conntrack_proto_find_get(u_int8_t protocol)
596 {
597 struct ip_conntrack_protocol *p;
598
599 preempt_disable();
600 p = __ip_conntrack_proto_find(protocol);
601 if (p) {
602 if (!try_module_get(p->me))
603 p = &ip_conntrack_generic_protocol;
604 }
605 preempt_enable();
606
607 return p;
608 }
609
610 void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
611 {
612 module_put(p->me);
613 }
614
615 struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
616 struct ip_conntrack_tuple *repl)
617 {
618 struct ip_conntrack *conntrack;
619
620 if (!ip_conntrack_hash_rnd_initted) {
621 get_random_bytes(&ip_conntrack_hash_rnd, 4);
622 ip_conntrack_hash_rnd_initted = 1;
623 }
624
625 /* We don't want any race condition at early drop stage */
626 atomic_inc(&ip_conntrack_count);
627
628 if (ip_conntrack_max
629 && atomic_read(&ip_conntrack_count) > ip_conntrack_max) {
630 unsigned int hash = hash_conntrack(orig);
631 /* Try dropping from this hash chain. */
632 if (!early_drop(&ip_conntrack_hash[hash])) {
633 atomic_dec(&ip_conntrack_count);
634 if (net_ratelimit())
635 printk(KERN_WARNING
636 "ip_conntrack: table full, dropping"
637 " packet.\n");
638 return ERR_PTR(-ENOMEM);
639 }
640 }
641
642 conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
643 if (!conntrack) {
644 DEBUGP("Can't allocate conntrack.\n");
645 atomic_dec(&ip_conntrack_count);
646 return ERR_PTR(-ENOMEM);
647 }
648
649 memset(conntrack, 0, sizeof(*conntrack));
650 atomic_set(&conntrack->ct_general.use, 1);
651 conntrack->ct_general.destroy = destroy_conntrack;
652 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
653 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
654 /* Don't set timer yet: wait for confirmation */
655 init_timer(&conntrack->timeout);
656 conntrack->timeout.data = (unsigned long)conntrack;
657 conntrack->timeout.function = death_by_timeout;
658
659 return conntrack;
660 }
661
662 void
663 ip_conntrack_free(struct ip_conntrack *conntrack)
664 {
665 atomic_dec(&ip_conntrack_count);
666 kmem_cache_free(ip_conntrack_cachep, conntrack);
667 }
668
669 /* Allocate a new conntrack: we return -ENOMEM if classification
670 * failed due to stress. Otherwise it really is unclassifiable */
671 static struct ip_conntrack_tuple_hash *
672 init_conntrack(struct ip_conntrack_tuple *tuple,
673 struct ip_conntrack_protocol *protocol,
674 struct sk_buff *skb)
675 {
676 struct ip_conntrack *conntrack;
677 struct ip_conntrack_tuple repl_tuple;
678 struct ip_conntrack_expect *exp;
679
680 if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
681 DEBUGP("Can't invert tuple.\n");
682 return NULL;
683 }
684
685 conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
686 if (conntrack == NULL || IS_ERR(conntrack))
687 return (struct ip_conntrack_tuple_hash *)conntrack;
688
689 if (!protocol->new(conntrack, skb)) {
690 ip_conntrack_free(conntrack);
691 return NULL;
692 }
693
694 write_lock_bh(&ip_conntrack_lock);
695 exp = find_expectation(tuple);
696
697 if (exp) {
698 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
699 conntrack, exp);
700 /* Welcome, Mr. Bond. We've been expecting you... */
701 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
702 conntrack->master = exp->master;
703 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
704 conntrack->mark = exp->master->mark;
705 #endif
706 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
707 defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
708 /* this is ugly, but there is no other place to put it */
709 conntrack->nat.masq_index = exp->master->nat.masq_index;
710 #endif
711 #ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
712 conntrack->secmark = exp->master->secmark;
713 #endif
714 nf_conntrack_get(&conntrack->master->ct_general);
715 CONNTRACK_STAT_INC(expect_new);
716 } else {
717 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
718
719 CONNTRACK_STAT_INC(new);
720 }
721
722 /* Overload tuple linked list to put us in unconfirmed list. */
723 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
724
725 write_unlock_bh(&ip_conntrack_lock);
726
727 if (exp) {
728 if (exp->expectfn)
729 exp->expectfn(conntrack, exp);
730 ip_conntrack_expect_put(exp);
731 }
732
733 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
734 }
735
736 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
737 static inline struct ip_conntrack *
738 resolve_normal_ct(struct sk_buff *skb,
739 struct ip_conntrack_protocol *proto,
740 int *set_reply,
741 unsigned int hooknum,
742 enum ip_conntrack_info *ctinfo)
743 {
744 struct ip_conntrack_tuple tuple;
745 struct ip_conntrack_tuple_hash *h;
746 struct ip_conntrack *ct;
747
748 IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
749
750 if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
751 &tuple,proto))
752 return NULL;
753
754 /* look for tuple match */
755 h = ip_conntrack_find_get(&tuple, NULL);
756 if (!h) {
757 h = init_conntrack(&tuple, proto, skb);
758 if (!h)
759 return NULL;
760 if (IS_ERR(h))
761 return (void *)h;
762 }
763 ct = tuplehash_to_ctrack(h);
764
765 /* It exists; we have a (non-exclusive) reference. */
766 if (DIRECTION(h) == IP_CT_DIR_REPLY) {
767 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
768 /* Set the reply bit if this packet is OK */
769 *set_reply = 1;
770 } else {
771 /* Once we've had two way comms, always ESTABLISHED. */
772 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
773 DEBUGP("ip_conntrack_in: normal packet for %p\n",
774 ct);
775 *ctinfo = IP_CT_ESTABLISHED;
776 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
777 DEBUGP("ip_conntrack_in: related packet for %p\n",
778 ct);
779 *ctinfo = IP_CT_RELATED;
780 } else {
781 DEBUGP("ip_conntrack_in: new packet for %p\n",
782 ct);
783 *ctinfo = IP_CT_NEW;
784 }
785 *set_reply = 0;
786 }
787 skb->nfct = &ct->ct_general;
788 skb->nfctinfo = *ctinfo;
789 return ct;
790 }
791
792 /* Netfilter hook itself. */
793 unsigned int ip_conntrack_in(unsigned int hooknum,
794 struct sk_buff **pskb,
795 const struct net_device *in,
796 const struct net_device *out,
797 int (*okfn)(struct sk_buff *))
798 {
799 struct ip_conntrack *ct;
800 enum ip_conntrack_info ctinfo;
801 struct ip_conntrack_protocol *proto;
802 int set_reply = 0;
803 int ret;
804
805 /* Previously seen (loopback or untracked)? Ignore. */
806 if ((*pskb)->nfct) {
807 CONNTRACK_STAT_INC(ignore);
808 return NF_ACCEPT;
809 }
810
811 /* Should never happen */
812 if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
813 if (net_ratelimit()) {
814 printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
815 (*pskb)->nh.iph->protocol, hooknum);
816 }
817 return NF_DROP;
818 }
819
820 /* Doesn't cover locally-generated broadcast, so not worth it. */
821 #if 0
822 /* Ignore broadcast: no `connection'. */
823 if ((*pskb)->pkt_type == PACKET_BROADCAST) {
824 printk("Broadcast packet!\n");
825 return NF_ACCEPT;
826 } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
827 == htonl(0x000000FF)) {
828 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
829 NIPQUAD((*pskb)->nh.iph->saddr),
830 NIPQUAD((*pskb)->nh.iph->daddr),
831 (*pskb)->sk, (*pskb)->pkt_type);
832 }
833 #endif
834
835 proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
836
837 /* It may be a special packet, error, unclean...
838 * the inverse of the return code tells the netfilter
839 * core what to do with the packet. */
840 if (proto->error != NULL
841 && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
842 CONNTRACK_STAT_INC(error);
843 CONNTRACK_STAT_INC(invalid);
844 return -ret;
845 }
846
847 if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
848 /* Not valid part of a connection */
849 CONNTRACK_STAT_INC(invalid);
850 return NF_ACCEPT;
851 }
852
853 if (IS_ERR(ct)) {
854 /* Too stressed to deal. */
855 CONNTRACK_STAT_INC(drop);
856 return NF_DROP;
857 }
858
859 IP_NF_ASSERT((*pskb)->nfct);
860
861 ret = proto->packet(ct, *pskb, ctinfo);
862 if (ret < 0) {
863 /* Invalid: the inverse of the return code tells
864 * the netfilter core what to do */
865 nf_conntrack_put((*pskb)->nfct);
866 (*pskb)->nfct = NULL;
867 CONNTRACK_STAT_INC(invalid);
868 return -ret;
869 }
870
871 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
872 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
873
874 return ret;
875 }
876
877 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
878 const struct ip_conntrack_tuple *orig)
879 {
880 return ip_ct_invert_tuple(inverse, orig,
881 __ip_conntrack_proto_find(orig->dst.protonum));
882 }
883
884 /* Would two expected things clash? */
885 static inline int expect_clash(const struct ip_conntrack_expect *a,
886 const struct ip_conntrack_expect *b)
887 {
888 /* Part covered by intersection of masks must be unequal,
889 otherwise they clash */
890 struct ip_conntrack_tuple intersect_mask
891 = { { a->mask.src.ip & b->mask.src.ip,
892 { a->mask.src.u.all & b->mask.src.u.all } },
893 { a->mask.dst.ip & b->mask.dst.ip,
894 { a->mask.dst.u.all & b->mask.dst.u.all },
895 a->mask.dst.protonum & b->mask.dst.protonum } };
896
897 return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
898 }
899
900 static inline int expect_matches(const struct ip_conntrack_expect *a,
901 const struct ip_conntrack_expect *b)
902 {
903 return a->master == b->master
904 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
905 && ip_ct_tuple_equal(&a->mask, &b->mask);
906 }
907
908 /* Generally a bad idea to call this: could have matched already. */
909 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
910 {
911 struct ip_conntrack_expect *i;
912
913 write_lock_bh(&ip_conntrack_lock);
914 /* choose the oldest expectation to evict */
915 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
916 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
917 ip_ct_unlink_expect(i);
918 write_unlock_bh(&ip_conntrack_lock);
919 ip_conntrack_expect_put(i);
920 return;
921 }
922 }
923 write_unlock_bh(&ip_conntrack_lock);
924 }
925
926 /* We don't increase the master conntrack refcount for non-fulfilled
927 * conntracks. During the conntrack destruction, the expectations are
928 * always killed before the conntrack itself */
929 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
930 {
931 struct ip_conntrack_expect *new;
932
933 new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
934 if (!new) {
935 DEBUGP("expect_related: OOM allocating expect\n");
936 return NULL;
937 }
938 new->master = me;
939 atomic_set(&new->use, 1);
940 return new;
941 }
942
943 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
944 {
945 if (atomic_dec_and_test(&exp->use))
946 kmem_cache_free(ip_conntrack_expect_cachep, exp);
947 }
948
949 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
950 {
951 atomic_inc(&exp->use);
952 exp->master->expecting++;
953 list_add(&exp->list, &ip_conntrack_expect_list);
954
955 init_timer(&exp->timeout);
956 exp->timeout.data = (unsigned long)exp;
957 exp->timeout.function = expectation_timed_out;
958 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
959 add_timer(&exp->timeout);
960
961 exp->id = ++ip_conntrack_expect_next_id;
962 atomic_inc(&exp->use);
963 CONNTRACK_STAT_INC(expect_create);
964 }
965
966 /* Race with expectations being used means we could have none to find; OK. */
967 static void evict_oldest_expect(struct ip_conntrack *master)
968 {
969 struct ip_conntrack_expect *i;
970
971 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
972 if (i->master == master) {
973 if (del_timer(&i->timeout)) {
974 ip_ct_unlink_expect(i);
975 ip_conntrack_expect_put(i);
976 }
977 break;
978 }
979 }
980 }
981
982 static inline int refresh_timer(struct ip_conntrack_expect *i)
983 {
984 if (!del_timer(&i->timeout))
985 return 0;
986
987 i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
988 add_timer(&i->timeout);
989 return 1;
990 }
991
992 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
993 {
994 struct ip_conntrack_expect *i;
995 int ret;
996
997 DEBUGP("ip_conntrack_expect_related %p\n", expect);
998 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
999 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
1000
1001 write_lock_bh(&ip_conntrack_lock);
1002 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1003 if (expect_matches(i, expect)) {
1004 /* Refresh timer: if it's dying, ignore.. */
1005 if (refresh_timer(i)) {
1006 ret = 0;
1007 goto out;
1008 }
1009 } else if (expect_clash(i, expect)) {
1010 ret = -EBUSY;
1011 goto out;
1012 }
1013 }
1014
1015 /* Would this take us over the max_expected limit? */
1016 if (expect->master->helper->max_expected &&
1017 expect->master->expecting >= expect->master->helper->max_expected)
1018 evict_oldest_expect(expect->master);
1019
1020 ip_conntrack_expect_insert(expect);
1021 ip_conntrack_expect_event(IPEXP_NEW, expect);
1022 ret = 0;
1023 out:
1024 write_unlock_bh(&ip_conntrack_lock);
1025 return ret;
1026 }
1027
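/* A disabled sketch of how a conntrack helper would typically request an
 * expectation for a related data connection.  The function and the
 * address/port values are hypothetical; real helpers (e.g. the FTP helper)
 * derive them from the parsed control-channel payload. */
#if 0
static int example_expect_data_conn(struct ip_conntrack *ct,
				    u_int32_t peer_ip, u_int16_t peer_port)
{
	struct ip_conntrack_expect *exp;
	int ret;

	exp = ip_conntrack_expect_alloc(ct);
	if (exp == NULL)
		return -ENOMEM;

	memset(&exp->tuple, 0, sizeof(exp->tuple));
	memset(&exp->mask, 0, sizeof(exp->mask));

	/* Expect TCP from the original source to peer_ip:peer_port; the
	 * source port is wildcarded via the mask. */
	exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
	exp->tuple.dst.ip = peer_ip;
	exp->tuple.dst.u.tcp.port = peer_port;	/* network byte order */
	exp->tuple.dst.protonum = IPPROTO_TCP;

	exp->mask.src.ip = 0xFFFFFFFF;
	exp->mask.dst.ip = 0xFFFFFFFF;
	exp->mask.dst.u.tcp.port = 0xFFFF;
	exp->mask.dst.protonum = 0xFF;

	exp->expectfn = NULL;
	exp->flags = 0;

	ret = ip_conntrack_expect_related(exp);
	ip_conntrack_expect_put(exp);		/* drop alloc's reference */
	return ret;
}
#endif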
1028 /* Alter reply tuple (maybe alter helper). This is for NAT, and is
1029 implicitly racy: see __ip_conntrack_confirm */
1030 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1031 const struct ip_conntrack_tuple *newreply)
1032 {
1033 write_lock_bh(&ip_conntrack_lock);
1034 /* Should be unconfirmed, so not in hash table yet */
1035 IP_NF_ASSERT(!is_confirmed(conntrack));
1036
1037 DEBUGP("Altering reply tuple of %p to ", conntrack);
1038 DUMP_TUPLE(newreply);
1039
1040 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1041 if (!conntrack->master && conntrack->expecting == 0)
1042 conntrack->helper = __ip_conntrack_helper_find(newreply);
1043 write_unlock_bh(&ip_conntrack_lock);
1044 }
1045
1046 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1047 {
1048 BUG_ON(me->timeout == 0);
1049 write_lock_bh(&ip_conntrack_lock);
1050 list_add(&me->list, &helpers);
1051 write_unlock_bh(&ip_conntrack_lock);
1052
1053 return 0;
1054 }
1055
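/* A disabled sketch of the registration pattern a helper module would
 * follow; the name, port and callback are hypothetical placeholders.
 * Helpers are matched against the REPLY tuple (see init_conntrack()
 * above), so the service port goes into tuple.src. */
#if 0
static int example_help(struct sk_buff **pskb,
			struct ip_conntrack *ct,
			enum ip_conntrack_info ctinfo)
{
	/* parse payload; possibly set up expectations here */
	return NF_ACCEPT;
}

static struct ip_conntrack_helper example_helper;

static int __init example_init(void)
{
	example_helper.name = "example";
	example_helper.me = THIS_MODULE;
	example_helper.max_expected = 1;
	example_helper.timeout = 5 * 60;		/* seconds */
	example_helper.tuple.src.u.tcp.port = htons(1234);
	example_helper.tuple.dst.protonum = IPPROTO_TCP;
	example_helper.mask.src.u.tcp.port = 0xFFFF;
	example_helper.mask.dst.protonum = 0xFF;
	example_helper.help = example_help;

	return ip_conntrack_helper_register(&example_helper);
}

static void __exit example_fini(void)
{
	ip_conntrack_helper_unregister(&example_helper);
}
/* a real module would add module_init(example_init)/module_exit(example_fini) */
#endif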
1056 struct ip_conntrack_helper *
1057 __ip_conntrack_helper_find_byname(const char *name)
1058 {
1059 struct ip_conntrack_helper *h;
1060
1061 list_for_each_entry(h, &helpers, list) {
1062 if (!strcmp(h->name, name))
1063 return h;
1064 }
1065
1066 return NULL;
1067 }
1068
1069 static inline void unhelp(struct ip_conntrack_tuple_hash *i,
1070 const struct ip_conntrack_helper *me)
1071 {
1072 if (tuplehash_to_ctrack(i)->helper == me) {
1073 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1074 tuplehash_to_ctrack(i)->helper = NULL;
1075 }
1076 }
1077
1078 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1079 {
1080 unsigned int i;
1081 struct ip_conntrack_tuple_hash *h;
1082 struct ip_conntrack_expect *exp, *tmp;
1083
1084 /* Need write lock here, to delete helper. */
1085 write_lock_bh(&ip_conntrack_lock);
1086 list_del(&me->list);
1087
1088 /* Get rid of expectations */
1089 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1090 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1091 ip_ct_unlink_expect(exp);
1092 ip_conntrack_expect_put(exp);
1093 }
1094 }
1095 /* Set the helper pointer to NULL on all remaining conntracks. */
1096 list_for_each_entry(h, &unconfirmed, list)
1097 unhelp(h, me);
1098 for (i = 0; i < ip_conntrack_htable_size; i++) {
1099 list_for_each_entry(h, &ip_conntrack_hash[i], list)
1100 unhelp(h, me);
1101 }
1102 write_unlock_bh(&ip_conntrack_lock);
1103
1104 /* Someone could still be looking at the helper in a bh. */
1105 synchronize_net();
1106 }
1107
1108 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1109 void __ip_ct_refresh_acct(struct ip_conntrack *ct,
1110 enum ip_conntrack_info ctinfo,
1111 const struct sk_buff *skb,
1112 unsigned long extra_jiffies,
1113 int do_acct)
1114 {
1115 int event = 0;
1116
1117 IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1118 IP_NF_ASSERT(skb);
1119
1120 write_lock_bh(&ip_conntrack_lock);
1121
1122 /* Only update if this is not a fixed timeout */
1123 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
1124 write_unlock_bh(&ip_conntrack_lock);
1125 return;
1126 }
1127
1128 /* If not in hash table, timer will not be active yet */
1129 if (!is_confirmed(ct)) {
1130 ct->timeout.expires = extra_jiffies;
1131 event = IPCT_REFRESH;
1132 } else {
1133 /* Need del_timer for race avoidance (may already be dying). */
1134 if (del_timer(&ct->timeout)) {
1135 ct->timeout.expires = jiffies + extra_jiffies;
1136 add_timer(&ct->timeout);
1137 event = IPCT_REFRESH;
1138 }
1139 }
1140
1141 #ifdef CONFIG_IP_NF_CT_ACCT
1142 if (do_acct) {
1143 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1144 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1145 ntohs(skb->nh.iph->tot_len);
1146 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1147 || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1148 event |= IPCT_COUNTER_FILLING;
1149 }
1150 #endif
1151
1152 write_unlock_bh(&ip_conntrack_lock);
1153
1154 /* must be unlocked when calling event cache */
1155 if (event)
1156 ip_conntrack_event_cache(event, skb);
1157 }
1158
1159 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1160 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1161 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1162 * in ip_conntrack_core, since we don't want the protocols to autoload
1163 * or depend on ctnetlink */
1164 int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1165 const struct ip_conntrack_tuple *tuple)
1166 {
1167 NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1168 &tuple->src.u.tcp.port);
1169 NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1170 &tuple->dst.u.tcp.port);
1171 return 0;
1172
1173 nfattr_failure:
1174 return -1;
1175 }
1176
1177 int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1178 struct ip_conntrack_tuple *t)
1179 {
1180 if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1181 return -EINVAL;
1182
1183 t->src.u.tcp.port =
1184 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1185 t->dst.u.tcp.port =
1186 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1187
1188 return 0;
1189 }
1190 #endif
1191
1192 /* Returns new sk_buff, or NULL */
1193 struct sk_buff *
1194 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1195 {
1196 skb_orphan(skb);
1197
1198 local_bh_disable();
1199 skb = ip_defrag(skb, user);
1200 local_bh_enable();
1201
1202 if (skb)
1203 ip_send_check(skb->nh.iph);
1204 return skb;
1205 }
1206
1207 /* Used by ipt_REJECT. */
1208 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1209 {
1210 struct ip_conntrack *ct;
1211 enum ip_conntrack_info ctinfo;
1212
1213 /* This ICMP is in the reverse direction to the packet which caused it */
1214 ct = ip_conntrack_get(skb, &ctinfo);
1215
1216 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1217 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1218 else
1219 ctinfo = IP_CT_RELATED;
1220
1221 /* Attach to new skbuff, and increment count */
1222 nskb->nfct = &ct->ct_general;
1223 nskb->nfctinfo = ctinfo;
1224 nf_conntrack_get(nskb->nfct);
1225 }
1226
1227 /* Bring out ya dead! */
1228 static struct ip_conntrack *
1229 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1230 void *data, unsigned int *bucket)
1231 {
1232 struct ip_conntrack_tuple_hash *h;
1233 struct ip_conntrack *ct;
1234
1235 write_lock_bh(&ip_conntrack_lock);
1236 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1237 list_for_each_entry(h, &ip_conntrack_hash[*bucket], list) {
1238 ct = tuplehash_to_ctrack(h);
1239 if (iter(ct, data))
1240 goto found;
1241 }
1242 }
1243 list_for_each_entry(h, &unconfirmed, list) {
1244 ct = tuplehash_to_ctrack(h);
1245 if (iter(ct, data))
1246 goto found;
1247 }
1248 write_unlock_bh(&ip_conntrack_lock);
1249 return NULL;
1250
1251 found:
1252 atomic_inc(&ct->ct_general.use);
1253 write_unlock_bh(&ip_conntrack_lock);
1254 return ct;
1255 }
1256
1257 void
1258 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1259 {
1260 struct ip_conntrack *ct;
1261 unsigned int bucket = 0;
1262
1263 while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
1264 /* Time to push up daisies... */
1265 if (del_timer(&ct->timeout))
1266 death_by_timeout((unsigned long)ct);
1267 /* ... else the timer will get him soon. */
1268
1269 ip_conntrack_put(ct);
1270 }
1271 }
1272
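/* A disabled sketch of a caller-supplied predicate for
 * ip_ct_iterate_cleanup(): returning non-zero kills that entry.  The
 * example would flush every connection originating from one (hypothetical)
 * source address. */
#if 0
static int example_kill_by_saddr(struct ip_conntrack *i, void *data)
{
	u_int32_t saddr = *(u_int32_t *)data;

	return i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip == saddr;
}

/* caller:
 *	u_int32_t dead = htonl(0xc0a80001);
 *	ip_ct_iterate_cleanup(example_kill_by_saddr, &dead);
 */
#endif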
1273 /* Fast function for those who don't want to parse /proc (and I don't
1274 blame them). */
1275 /* Reversing the socket's dst/src point of view gives us the reply
1276 mapping. */
1277 static int
1278 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1279 {
1280 struct inet_sock *inet = inet_sk(sk);
1281 struct ip_conntrack_tuple_hash *h;
1282 struct ip_conntrack_tuple tuple;
1283
1284 IP_CT_TUPLE_U_BLANK(&tuple);
1285 tuple.src.ip = inet->rcv_saddr;
1286 tuple.src.u.tcp.port = inet->sport;
1287 tuple.dst.ip = inet->daddr;
1288 tuple.dst.u.tcp.port = inet->dport;
1289 tuple.dst.protonum = IPPROTO_TCP;
1290
1291 /* We only do TCP at the moment: is there a better way? */
1292 if (strcmp(sk->sk_prot->name, "TCP")) {
1293 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1294 return -ENOPROTOOPT;
1295 }
1296
1297 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1298 DEBUGP("SO_ORIGINAL_DST: len %u not %Zu\n",
1299 *len, sizeof(struct sockaddr_in));
1300 return -EINVAL;
1301 }
1302
1303 h = ip_conntrack_find_get(&tuple, NULL);
1304 if (h) {
1305 struct sockaddr_in sin;
1306 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1307
1308 sin.sin_family = AF_INET;
1309 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1310 .tuple.dst.u.tcp.port;
1311 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1312 .tuple.dst.ip;
1313 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
1314
1315 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1316 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1317 ip_conntrack_put(ct);
1318 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1319 return -EFAULT;
1320 else
1321 return 0;
1322 }
1323 DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1324 NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1325 NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1326 return -ENOENT;
1327 }
1328
1329 static struct nf_sockopt_ops so_getorigdst = {
1330 .pf = PF_INET,
1331 .get_optmin = SO_ORIGINAL_DST,
1332 .get_optmax = SO_ORIGINAL_DST+1,
1333 .get = &getorigdst,
1334 };
1335
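/* A disabled user-space sketch (not kernel code): how a transparent proxy
 * would query the pre-NAT destination of an accepted TCP connection through
 * the sockopt registered here.  The wrapper function is hypothetical. */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/netfilter_ipv4.h>	/* SO_ORIGINAL_DST */

static int example_get_original_dst(int connfd, struct sockaddr_in *dst)
{
	socklen_t len = sizeof(*dst);

	/* this getsockopt() call ends up in getorigdst() above */
	return getsockopt(connfd, SOL_IP, SO_ORIGINAL_DST, dst, &len);
}
#endif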
1336 static int kill_all(struct ip_conntrack *i, void *data)
1337 {
1338 return 1;
1339 }
1340
1341 void ip_conntrack_flush(void)
1342 {
1343 ip_ct_iterate_cleanup(kill_all, NULL);
1344 }
1345
1346 static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
1347 {
1348 if (vmalloced)
1349 vfree(hash);
1350 else
1351 free_pages((unsigned long)hash,
1352 get_order(sizeof(struct list_head) * size));
1353 }
1354
1355 /* Mishearing the voices in his head, our hero wonders how he's
1356 supposed to kill the mall. */
1357 void ip_conntrack_cleanup(void)
1358 {
1359 ip_ct_attach = NULL;
1360
1361 /* This makes sure all current packets have passed through
1362 netfilter framework. Roll on, two-stage module
1363 delete... */
1364 synchronize_net();
1365
1366 ip_ct_event_cache_flush();
1367 i_see_dead_people:
1368 ip_conntrack_flush();
1369 if (atomic_read(&ip_conntrack_count) != 0) {
1370 schedule();
1371 goto i_see_dead_people;
1372 }
1373 /* wait until all references to ip_conntrack_untracked are dropped */
1374 while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1375 schedule();
1376
1377 kmem_cache_destroy(ip_conntrack_cachep);
1378 kmem_cache_destroy(ip_conntrack_expect_cachep);
1379 free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1380 ip_conntrack_htable_size);
1381 nf_unregister_sockopt(&so_getorigdst);
1382 }
1383
1384 static struct list_head *alloc_hashtable(int size, int *vmalloced)
1385 {
1386 struct list_head *hash;
1387 unsigned int i;
1388
1389 *vmalloced = 0;
1390 hash = (void*)__get_free_pages(GFP_KERNEL,
1391 get_order(sizeof(struct list_head)
1392 * size));
1393 if (!hash) {
1394 *vmalloced = 1;
1395 printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n");
1396 hash = vmalloc(sizeof(struct list_head) * size);
1397 }
1398
1399 if (hash)
1400 for (i = 0; i < size; i++)
1401 INIT_LIST_HEAD(&hash[i]);
1402
1403 return hash;
1404 }
1405
1406 static int set_hashsize(const char *val, struct kernel_param *kp)
1407 {
1408 int i, bucket, hashsize, vmalloced;
1409 int old_vmalloced, old_size;
1410 int rnd;
1411 struct list_head *hash, *old_hash;
1412 struct ip_conntrack_tuple_hash *h;
1413
1414 /* On boot, we can set this without any fancy locking. */
1415 if (!ip_conntrack_htable_size)
1416 return param_set_int(val, kp);
1417
1418 hashsize = simple_strtol(val, NULL, 0);
1419 if (!hashsize)
1420 return -EINVAL;
1421
1422 hash = alloc_hashtable(hashsize, &vmalloced);
1423 if (!hash)
1424 return -ENOMEM;
1425
1426 /* We have to rehash for the new table anyway, so we can also
1427 * use a new random seed */
1428 get_random_bytes(&rnd, 4);
1429
1430 write_lock_bh(&ip_conntrack_lock);
1431 for (i = 0; i < ip_conntrack_htable_size; i++) {
1432 while (!list_empty(&ip_conntrack_hash[i])) {
1433 h = list_entry(ip_conntrack_hash[i].next,
1434 struct ip_conntrack_tuple_hash, list);
1435 list_del(&h->list);
1436 bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1437 list_add_tail(&h->list, &hash[bucket]);
1438 }
1439 }
1440 old_size = ip_conntrack_htable_size;
1441 old_vmalloced = ip_conntrack_vmalloc;
1442 old_hash = ip_conntrack_hash;
1443
1444 ip_conntrack_htable_size = hashsize;
1445 ip_conntrack_vmalloc = vmalloced;
1446 ip_conntrack_hash = hash;
1447 ip_conntrack_hash_rnd = rnd;
1448 write_unlock_bh(&ip_conntrack_lock);
1449
1450 free_conntrack_hash(old_hash, old_vmalloced, old_size);
1451 return 0;
1452 }
1453
1454 module_param_call(hashsize, set_hashsize, param_get_uint,
1455 &ip_conntrack_htable_size, 0600);
1456
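/* Usage note (an assumption based on the standard module-parameter layout):
 * writing a new size to /sys/module/ip_conntrack/parameters/hashsize, or
 * booting with "ip_conntrack.hashsize=N" when the tracker is built in,
 * lands in set_hashsize() above, which rehashes every existing entry into
 * the freshly allocated table under ip_conntrack_lock. */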
1457 int __init ip_conntrack_init(void)
1458 {
1459 unsigned int i;
1460 int ret;
1461
1462 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1463 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
1464 if (!ip_conntrack_htable_size) {
1465 ip_conntrack_htable_size
1466 = (((num_physpages << PAGE_SHIFT) / 16384)
1467 / sizeof(struct list_head));
1468 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1469 ip_conntrack_htable_size = 8192;
1470 if (ip_conntrack_htable_size < 16)
1471 ip_conntrack_htable_size = 16;
1472 }
1473 ip_conntrack_max = 8 * ip_conntrack_htable_size;
1474
1475 printk("ip_conntrack version %s (%u buckets, %d max)"
1476 " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1477 ip_conntrack_htable_size, ip_conntrack_max,
1478 sizeof(struct ip_conntrack));
1479
1480 ret = nf_register_sockopt(&so_getorigdst);
1481 if (ret != 0) {
1482 printk(KERN_ERR "Unable to register netfilter socket option\n");
1483 return ret;
1484 }
1485
1486 ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
1487 &ip_conntrack_vmalloc);
1488 if (!ip_conntrack_hash) {
1489 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1490 goto err_unreg_sockopt;
1491 }
1492
1493 ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1494 sizeof(struct ip_conntrack), 0,
1495 0, NULL, NULL);
1496 if (!ip_conntrack_cachep) {
1497 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1498 goto err_free_hash;
1499 }
1500
1501 ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1502 sizeof(struct ip_conntrack_expect),
1503 0, 0, NULL, NULL);
1504 if (!ip_conntrack_expect_cachep) {
1505 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1506 goto err_free_conntrack_slab;
1507 }
1508
1509 /* Don't NEED lock here, but good form anyway. */
1510 write_lock_bh(&ip_conntrack_lock);
1511 for (i = 0; i < MAX_IP_CT_PROTO; i++)
1512 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1513 /* Sew in builtin protocols. */
1514 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1515 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1516 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1517 write_unlock_bh(&ip_conntrack_lock);
1518
1519 /* For use by ipt_REJECT */
1520 ip_ct_attach = ip_conntrack_attach;
1521
1522 /* Set up fake conntrack:
1523 - to never be deleted, not in any hashes */
1524 atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1525 /* - and make it look like a confirmed connection */
1526 set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1527
1528 return ret;
1529
1530 err_free_conntrack_slab:
1531 kmem_cache_destroy(ip_conntrack_cachep);
1532 err_free_hash:
1533 free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1534 ip_conntrack_htable_size);
1535 err_unreg_sockopt:
1536 nf_unregister_sockopt(&so_getorigdst);
1537
1538 return -ENOMEM;
1539 }