1da177e4
LT
1/* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
4
5/* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 *
12 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13 * - new API and handling of conntrack/nat helpers
14 * - now capable of multiple expectations for one master
15 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16 * - add usage/reference counts to ip_conntrack_expect
17 * - export ip_conntrack[_expect]_{find_get,put} functions
18 * */
19
1da177e4
LT
20#include <linux/types.h>
21#include <linux/icmp.h>
22#include <linux/ip.h>
23#include <linux/netfilter.h>
24#include <linux/netfilter_ipv4.h>
25#include <linux/module.h>
26#include <linux/skbuff.h>
27#include <linux/proc_fs.h>
28#include <linux/vmalloc.h>
29#include <net/checksum.h>
30#include <net/ip.h>
31#include <linux/stddef.h>
32#include <linux/sysctl.h>
33#include <linux/slab.h>
34#include <linux/random.h>
35#include <linux/jhash.h>
36#include <linux/err.h>
37#include <linux/percpu.h>
38#include <linux/moduleparam.h>
ac3247ba 39#include <linux/notifier.h>
1da177e4 40
e45b1be8 41/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
1da177e4 42 registrations, conntrack timers */
1da177e4
LT
43#include <linux/netfilter_ipv4/ip_conntrack.h>
44#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
45#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
46#include <linux/netfilter_ipv4/ip_conntrack_core.h>
1da177e4 47
eed75f19 48#define IP_CONNTRACK_VERSION "2.4"
1da177e4
LT
49
50#if 0
51#define DEBUGP printk
52#else
53#define DEBUGP(format, args...)
54#endif
55
e45b1be8 56DEFINE_RWLOCK(ip_conntrack_lock);
1da177e4
LT
57
58/* ip_conntrack_standalone needs this */
59atomic_t ip_conntrack_count = ATOMIC_INIT(0);
60
61void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
62LIST_HEAD(ip_conntrack_expect_list);
1192e403 63struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO] __read_mostly;
1da177e4 64static LIST_HEAD(helpers);
94aec08e
BH
65unsigned int ip_conntrack_htable_size __read_mostly = 0;
66int ip_conntrack_max __read_mostly;
1192e403 67struct list_head *ip_conntrack_hash __read_mostly;
e18b890b
CL
68static struct kmem_cache *ip_conntrack_cachep __read_mostly;
69static struct kmem_cache *ip_conntrack_expect_cachep __read_mostly;
1da177e4 70struct ip_conntrack ip_conntrack_untracked;
94aec08e 71unsigned int ip_ct_log_invalid __read_mostly;
1da177e4 72static LIST_HEAD(unconfirmed);
1192e403 73static int ip_conntrack_vmalloc __read_mostly;
1da177e4 74
4e3882f7
PNA
75static unsigned int ip_conntrack_next_id;
76static unsigned int ip_conntrack_expect_next_id;
ac3247ba 77#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
e041c683
AS
78ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
79ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);
ac3247ba
HW
80
81DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
82
a86888b9
PM
83/* deliver cached events and clear cache entry - must be called with locally
84 * disabled softirqs */
85static inline void
86__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
ac3247ba 87{
a86888b9 88 DEBUGP("ecache: delivering events for %p\n", ecache->ct);
ac3247ba 89 if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
e041c683 90 atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
ac3247ba
HW
91 ecache->ct);
92 ecache->events = 0;
a86888b9
PM
93 ip_conntrack_put(ecache->ct);
94 ecache->ct = NULL;
ac3247ba
HW
95}
96
97/* Deliver all cached events for a particular conntrack. This is called
98 * by code prior to async packet handling or freeing the skb */
a86888b9 99void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
ac3247ba 100{
a86888b9
PM
101 struct ip_conntrack_ecache *ecache;
102
103 local_bh_disable();
104 ecache = &__get_cpu_var(ip_conntrack_ecache);
105 if (ecache->ct == ct)
106 __ip_ct_deliver_cached_events(ecache);
107 local_bh_enable();
108}
ac3247ba 109
a86888b9
PM
110void __ip_ct_event_cache_init(struct ip_conntrack *ct)
111{
112 struct ip_conntrack_ecache *ecache;
ac3247ba 113
a86888b9
PM
114 /* take care of delivering potentially old events */
115 ecache = &__get_cpu_var(ip_conntrack_ecache);
116 BUG_ON(ecache->ct == ct);
117 if (ecache->ct)
118 __ip_ct_deliver_cached_events(ecache);
119 /* initialize for this conntrack/packet */
120 ecache->ct = ct;
121 nf_conntrack_get(&ct->ct_general);
ac3247ba
HW
122}
123
a86888b9
PM
124/* flush the event cache - touches other CPUs' data and must not be called while
125 * packets are still passing through the code */
126static void ip_ct_event_cache_flush(void)
ac3247ba 127{
a86888b9
PM
128 struct ip_conntrack_ecache *ecache;
129 int cpu;
ac3247ba 130
6f912042 131 for_each_possible_cpu(cpu) {
a86888b9
PM
132 ecache = &per_cpu(ip_conntrack_ecache, cpu);
133 if (ecache->ct)
ac3247ba 134 ip_conntrack_put(ecache->ct);
ac3247ba
HW
135 }
136}
a86888b9
PM
137#else
138static inline void ip_ct_event_cache_flush(void) {}
ac3247ba
HW
139#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
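/* A rough sketch of the intended usage, assuming a caller that has just
 * updated a conntrack from packet context: per-packet state changes are
 * recorded with
 *
 *	ip_conntrack_event_cache(IPCT_STATUS, skb);
 *
 * which only sets bits in this CPU's ip_conntrack_ecache, and the batched
 * events are later pushed through ip_conntrack_chain in a single
 * ip_ct_deliver_cached_events(ct) call, so several changes to one conntrack
 * cost one notifier invocation. */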
140
1da177e4
LT
141DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
142
1da177e4
LT
143static int ip_conntrack_hash_rnd_initted;
144static unsigned int ip_conntrack_hash_rnd;
145
eed75f19
HW
146static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
147 unsigned int size, unsigned int rnd)
1da177e4 148{
cdcb71bf
AV
149 return (jhash_3words((__force u32)tuple->src.ip,
150 ((__force u32)tuple->dst.ip ^ tuple->dst.protonum),
1da177e4 151 (tuple->src.u.all | (tuple->dst.u.all << 16)),
eed75f19
HW
152 rnd) % size);
153}
154
155static u_int32_t
156hash_conntrack(const struct ip_conntrack_tuple *tuple)
157{
158 return __hash_conntrack(tuple, ip_conntrack_htable_size,
159 ip_conntrack_hash_rnd);
1da177e4
LT
160}
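/* Rough worked example of the hash above (addresses and ports are
 * hypothetical): for a TCP tuple 192.168.1.2:1025 -> 10.0.0.1:80,
 * jhash_3words() mixes saddr, daddr ^ protonum and the two ports packed
 * into one 32-bit word (sport | dport << 16) together with the boot-time
 * seed ip_conntrack_hash_rnd; the result modulo ip_conntrack_htable_size
 * selects the bucket. The random seed keeps remote hosts from predicting
 * bucket placement and flooding a single hash chain. */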
161
162int
163ip_ct_get_tuple(const struct iphdr *iph,
164 const struct sk_buff *skb,
165 unsigned int dataoff,
166 struct ip_conntrack_tuple *tuple,
167 const struct ip_conntrack_protocol *protocol)
168{
169	/* Should never happen */
170 if (iph->frag_off & htons(IP_OFFSET)) {
171 printk("ip_conntrack_core: Frag of proto %u.\n",
172 iph->protocol);
173 return 0;
174 }
175
176 tuple->src.ip = iph->saddr;
177 tuple->dst.ip = iph->daddr;
178 tuple->dst.protonum = iph->protocol;
179 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
180
181 return protocol->pkt_to_tuple(skb, dataoff, tuple);
182}
183
184int
185ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
186 const struct ip_conntrack_tuple *orig,
187 const struct ip_conntrack_protocol *protocol)
188{
189 inverse->src.ip = orig->dst.ip;
190 inverse->dst.ip = orig->src.ip;
191 inverse->dst.protonum = orig->dst.protonum;
192 inverse->dst.dir = !orig->dst.dir;
193
194 return protocol->invert_tuple(inverse, orig);
195}
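/* Sketch of the inversion for an assumed TCP tuple: the original tuple
 * 192.168.1.2:1025 -> 10.0.0.1:80 becomes the reply tuple
 * 10.0.0.1:80 -> 192.168.1.2:1025 with the direction flag flipped; the
 * protocol's invert_tuple() callback handles the per-protocol part, e.g.
 * swapping the TCP ports or mapping an ICMP type to its reply type. */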
196
197
198/* ip_conntrack_expect helper functions */
49719eb3 199void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
1da177e4 200{
4acdbdbe 201 IP_NF_ASSERT(!timer_pending(&exp->timeout));
1da177e4 202 list_del(&exp->list);
4acdbdbe 203 CONNTRACK_STAT_INC(expect_delete);
1da177e4 204 exp->master->expecting--;
37012f7f 205 ip_conntrack_expect_put(exp);
1da177e4
LT
206}
207
208static void expectation_timed_out(unsigned long ul_expect)
209{
210 struct ip_conntrack_expect *exp = (void *)ul_expect;
211
e45b1be8 212 write_lock_bh(&ip_conntrack_lock);
49719eb3 213 ip_ct_unlink_expect(exp);
e45b1be8 214 write_unlock_bh(&ip_conntrack_lock);
4acdbdbe 215 ip_conntrack_expect_put(exp);
1da177e4
LT
216}
217
080774a2
HW
218struct ip_conntrack_expect *
219__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
220{
221 struct ip_conntrack_expect *i;
222
223 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
2e47c264 224 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask))
080774a2 225 return i;
080774a2
HW
226 }
227 return NULL;
228}
229
230/* Just find an expectation corresponding to a tuple. */
231struct ip_conntrack_expect *
468ec44b 232ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
080774a2
HW
233{
234 struct ip_conntrack_expect *i;
235
236 read_lock_bh(&ip_conntrack_lock);
237 i = __ip_conntrack_expect_find(tuple);
2e47c264
YK
238 if (i)
239 atomic_inc(&i->use);
080774a2
HW
240 read_unlock_bh(&ip_conntrack_lock);
241
242 return i;
243}
244
1da177e4
LT
245/* If an expectation for this connection is found, it is deleted from the
246 * global list and then returned. */
247static struct ip_conntrack_expect *
248find_expectation(const struct ip_conntrack_tuple *tuple)
249{
250 struct ip_conntrack_expect *i;
251
252 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
253 /* If master is not in hash table yet (ie. packet hasn't left
254 this machine yet), how can other end know about expected?
255 Hence these are not the droids you are looking for (if
256 master ct never got confirmed, we'd hold a reference to it
257 and weird things would happen to future packets). */
258 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
2248bcfc
PM
259 && is_confirmed(i->master)) {
260 if (i->flags & IP_CT_EXPECT_PERMANENT) {
261 atomic_inc(&i->use);
262 return i;
263 } else if (del_timer(&i->timeout)) {
49719eb3 264 ip_ct_unlink_expect(i);
2248bcfc
PM
265 return i;
266 }
1da177e4
LT
267 }
268 }
269 return NULL;
270}
271
272/* delete all expectations for this conntrack */
080774a2 273void ip_ct_remove_expectations(struct ip_conntrack *ct)
1da177e4
LT
274{
275 struct ip_conntrack_expect *i, *tmp;
276
277	/* Optimization: most connections never expect any others. */
278 if (ct->expecting == 0)
279 return;
280
281 list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
282 if (i->master == ct && del_timer(&i->timeout)) {
49719eb3 283 ip_ct_unlink_expect(i);
4acdbdbe 284 ip_conntrack_expect_put(i);
1da177e4
LT
285 }
286 }
287}
288
289static void
290clean_from_lists(struct ip_conntrack *ct)
291{
1da177e4 292 DEBUGP("clean_from_lists(%p)\n", ct);
df0933dc
PM
293 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
294 list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list);
1da177e4
LT
295
296 /* Destroy all pending expectations */
080774a2 297 ip_ct_remove_expectations(ct);
1da177e4
LT
298}
299
300static void
301destroy_conntrack(struct nf_conntrack *nfct)
302{
303 struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
304 struct ip_conntrack_protocol *proto;
4c5de695 305 struct ip_conntrack_helper *helper;
1da177e4
LT
306
307 DEBUGP("destroy_conntrack(%p)\n", ct);
308 IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
309 IP_NF_ASSERT(!timer_pending(&ct->timeout));
310
14a50bba 311 ip_conntrack_event(IPCT_DESTROY, ct);
ac3247ba
HW
312 set_bit(IPS_DYING_BIT, &ct->status);
313
4c5de695
PM
314 helper = ct->helper;
315 if (helper && helper->destroy)
316 helper->destroy(ct);
317
1da177e4
LT
318 /* To make sure we don't get any weird locking issues here:
319 * destroy_conntrack() MUST NOT be called with a write lock
320 * to ip_conntrack_lock!!! -HW */
080774a2 321 proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
1da177e4
LT
322 if (proto && proto->destroy)
323 proto->destroy(ct);
324
325 if (ip_conntrack_destroyed)
326 ip_conntrack_destroyed(ct);
327
e45b1be8 328 write_lock_bh(&ip_conntrack_lock);
1da177e4
LT
329 /* Expectations will have been removed in clean_from_lists,
330 * except TFTP can create an expectation on the first packet,
331	 * before the connection is in the list, so we need to clean here,
332 * too. */
080774a2 333 ip_ct_remove_expectations(ct);
1da177e4
LT
334
335 /* We overload first tuple to link into unconfirmed list. */
336 if (!is_confirmed(ct)) {
337 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
338 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
339 }
340
341 CONNTRACK_STAT_INC(delete);
e45b1be8 342 write_unlock_bh(&ip_conntrack_lock);
1da177e4
LT
343
344 if (ct->master)
345 ip_conntrack_put(ct->master);
346
347 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
080774a2 348 ip_conntrack_free(ct);
1da177e4
LT
349}
350
351static void death_by_timeout(unsigned long ul_conntrack)
352{
353 struct ip_conntrack *ct = (void *)ul_conntrack;
354
e45b1be8 355 write_lock_bh(&ip_conntrack_lock);
1da177e4
LT
356 /* Inside lock so preempt is disabled on module removal path.
357 * Otherwise we can get spurious warnings. */
358 CONNTRACK_STAT_INC(delete_list);
359 clean_from_lists(ct);
e45b1be8 360 write_unlock_bh(&ip_conntrack_lock);
1da177e4
LT
361 ip_conntrack_put(ct);
362}
363
080774a2 364struct ip_conntrack_tuple_hash *
1da177e4
LT
365__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
366 const struct ip_conntrack *ignored_conntrack)
367{
368 struct ip_conntrack_tuple_hash *h;
369 unsigned int hash = hash_conntrack(tuple);
370
1da177e4 371 list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
df0933dc
PM
372 if (tuplehash_to_ctrack(h) != ignored_conntrack &&
373 ip_ct_tuple_equal(tuple, &h->tuple)) {
1da177e4
LT
374 CONNTRACK_STAT_INC(found);
375 return h;
376 }
377 CONNTRACK_STAT_INC(searched);
378 }
379
380 return NULL;
381}
382
383/* Find a connection corresponding to a tuple. */
384struct ip_conntrack_tuple_hash *
385ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
386 const struct ip_conntrack *ignored_conntrack)
387{
388 struct ip_conntrack_tuple_hash *h;
389
e45b1be8 390 read_lock_bh(&ip_conntrack_lock);
1da177e4
LT
391 h = __ip_conntrack_find(tuple, ignored_conntrack);
392 if (h)
393 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
e45b1be8 394 read_unlock_bh(&ip_conntrack_lock);
1da177e4
LT
395
396 return h;
397}
398
080774a2
HW
399static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
400 unsigned int hash,
401 unsigned int repl_hash)
402{
403 ct->id = ++ip_conntrack_next_id;
df0933dc
PM
404 list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list,
405 &ip_conntrack_hash[hash]);
406 list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list,
407 &ip_conntrack_hash[repl_hash]);
080774a2
HW
408}
409
410void ip_conntrack_hash_insert(struct ip_conntrack *ct)
411{
412 unsigned int hash, repl_hash;
413
414 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
415 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
416
417 write_lock_bh(&ip_conntrack_lock);
418 __ip_conntrack_hash_insert(ct, hash, repl_hash);
419 write_unlock_bh(&ip_conntrack_lock);
420}
421
1da177e4
LT
422/* Confirm a connection given skb; places it in hash table */
423int
424__ip_conntrack_confirm(struct sk_buff **pskb)
425{
426 unsigned int hash, repl_hash;
df0933dc 427 struct ip_conntrack_tuple_hash *h;
1da177e4
LT
428 struct ip_conntrack *ct;
429 enum ip_conntrack_info ctinfo;
430
431 ct = ip_conntrack_get(*pskb, &ctinfo);
432
433 /* ipt_REJECT uses ip_conntrack_attach to attach related
434	   ICMP/TCP RST packets in the other direction. The actual packet
435	   which created the connection will be IP_CT_NEW or, for an
436	   expected connection, IP_CT_RELATED. */
437 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
438 return NF_ACCEPT;
439
440 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
441 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
442
443 /* We're not in hash table, and we refuse to set up related
444 connections for unconfirmed conns. But packet copies and
445 REJECT will give spurious warnings here. */
446 /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
447
448	/* No external references means no one else could have
449 confirmed us. */
450 IP_NF_ASSERT(!is_confirmed(ct));
451 DEBUGP("Confirming conntrack %p\n", ct);
452
e45b1be8 453 write_lock_bh(&ip_conntrack_lock);
1da177e4
LT
454
455 /* See if there's one in the list already, including reverse:
456 NAT could have grabbed it without realizing, since we're
457	   not in the hash. If there is, we lost the race. */
df0933dc
PM
458 list_for_each_entry(h, &ip_conntrack_hash[hash], list)
459 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
460 &h->tuple))
461 goto out;
462 list_for_each_entry(h, &ip_conntrack_hash[repl_hash], list)
463 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
464 &h->tuple))
465 goto out;
1da177e4 466
df0933dc
PM
467 /* Remove from unconfirmed list */
468 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
469
470 __ip_conntrack_hash_insert(ct, hash, repl_hash);
471 /* Timer relative to confirmation time, not original
472 setting time, otherwise we'd get timer wrap in
473 weird delay cases. */
474 ct->timeout.expires += jiffies;
475 add_timer(&ct->timeout);
476 atomic_inc(&ct->ct_general.use);
477 set_bit(IPS_CONFIRMED_BIT, &ct->status);
478 CONNTRACK_STAT_INC(insert);
479 write_unlock_bh(&ip_conntrack_lock);
480 if (ct->helper)
481 ip_conntrack_event_cache(IPCT_HELPER, *pskb);
ac3247ba 482#ifdef CONFIG_IP_NF_NAT_NEEDED
df0933dc
PM
483 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
484 test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
485 ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
ac3247ba 486#endif
df0933dc
PM
487 ip_conntrack_event_cache(master_ct(ct) ?
488 IPCT_RELATED : IPCT_NEW, *pskb);
ac3247ba 489
df0933dc 490 return NF_ACCEPT;
1da177e4 491
df0933dc 492out:
1da177e4 493 CONNTRACK_STAT_INC(insert_failed);
e45b1be8 494 write_unlock_bh(&ip_conntrack_lock);
1da177e4
LT
495 return NF_DROP;
496}
497
498/* Returns true if a connection corresponds to the tuple (required
499 for NAT). */
500int
501ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
502 const struct ip_conntrack *ignored_conntrack)
503{
504 struct ip_conntrack_tuple_hash *h;
505
e45b1be8 506 read_lock_bh(&ip_conntrack_lock);
1da177e4 507 h = __ip_conntrack_find(tuple, ignored_conntrack);
e45b1be8 508 read_unlock_bh(&ip_conntrack_lock);
1da177e4
LT
509
510 return h != NULL;
511}
512
513/* There's a small race here where we may free a just-assured
514 connection. Too bad: we're in trouble anyway. */
1da177e4
LT
515static int early_drop(struct list_head *chain)
516{
517 /* Traverse backwards: gives us oldest, which is roughly LRU */
518 struct ip_conntrack_tuple_hash *h;
df0933dc 519 struct ip_conntrack *ct = NULL, *tmp;
1da177e4
LT
520 int dropped = 0;
521
e45b1be8 522 read_lock_bh(&ip_conntrack_lock);
df0933dc
PM
523 list_for_each_entry_reverse(h, chain, list) {
524 tmp = tuplehash_to_ctrack(h);
525 if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) {
526 ct = tmp;
527 atomic_inc(&ct->ct_general.use);
528 break;
529 }
1da177e4 530 }
e45b1be8 531 read_unlock_bh(&ip_conntrack_lock);
1da177e4
LT
532
533 if (!ct)
534 return dropped;
535
536 if (del_timer(&ct->timeout)) {
537 death_by_timeout((unsigned long)ct);
538 dropped = 1;
539 CONNTRACK_STAT_INC(early_drop);
540 }
541 ip_conntrack_put(ct);
542 return dropped;
543}
544
080774a2
HW
545static struct ip_conntrack_helper *
546__ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple)
1da177e4 547{
df0933dc
PM
548 struct ip_conntrack_helper *h;
549
550 list_for_each_entry(h, &helpers, list) {
551 if (ip_ct_tuple_mask_cmp(tuple, &h->tuple, &h->mask))
552 return h;
553 }
554 return NULL;
1da177e4
LT
555}
556
080774a2
HW
557struct ip_conntrack_helper *
558ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple)
559{
560 struct ip_conntrack_helper *helper;
561
562 /* need ip_conntrack_lock to assure that helper exists until
563 * try_module_get() is called */
564 read_lock_bh(&ip_conntrack_lock);
565
566 helper = __ip_conntrack_helper_find(tuple);
567 if (helper) {
568 /* need to increase module usage count to assure helper will
569 * not go away while the caller is e.g. busy putting a
570 * conntrack in the hash that uses the helper */
571 if (!try_module_get(helper->me))
572 helper = NULL;
573 }
574
575 read_unlock_bh(&ip_conntrack_lock);
576
577 return helper;
578}
579
580void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
581{
582 module_put(helper->me);
583}
584
585struct ip_conntrack_protocol *
586__ip_conntrack_proto_find(u_int8_t protocol)
587{
588 return ip_ct_protos[protocol];
589}
590
591/* this is guaranteed to always return a valid protocol helper, since
592 * it falls back to generic_protocol */
593struct ip_conntrack_protocol *
594ip_conntrack_proto_find_get(u_int8_t protocol)
595{
596 struct ip_conntrack_protocol *p;
597
598 preempt_disable();
599 p = __ip_conntrack_proto_find(protocol);
600 if (p) {
601 if (!try_module_get(p->me))
602 p = &ip_conntrack_generic_protocol;
603 }
604 preempt_enable();
605
606 return p;
607}
608
609void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
610{
611 module_put(p->me);
612}
613
614struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
615 struct ip_conntrack_tuple *repl)
1da177e4
LT
616{
617 struct ip_conntrack *conntrack;
1da177e4
LT
618
619 if (!ip_conntrack_hash_rnd_initted) {
620 get_random_bytes(&ip_conntrack_hash_rnd, 4);
621 ip_conntrack_hash_rnd_initted = 1;
622 }
623
5251e2d2
PNA
624 /* We don't want any race condition at early drop stage */
625 atomic_inc(&ip_conntrack_count);
626
1da177e4 627 if (ip_conntrack_max
5251e2d2 628 && atomic_read(&ip_conntrack_count) > ip_conntrack_max) {
080774a2 629 unsigned int hash = hash_conntrack(orig);
1da177e4
LT
630 /* Try dropping from this hash chain. */
631 if (!early_drop(&ip_conntrack_hash[hash])) {
5251e2d2 632 atomic_dec(&ip_conntrack_count);
1da177e4
LT
633 if (net_ratelimit())
634 printk(KERN_WARNING
635 "ip_conntrack: table full, dropping"
636 " packet.\n");
637 return ERR_PTR(-ENOMEM);
638 }
639 }
640
1da177e4
LT
641 conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
642 if (!conntrack) {
643 DEBUGP("Can't allocate conntrack.\n");
5251e2d2 644 atomic_dec(&ip_conntrack_count);
7663f188 645 return ERR_PTR(-ENOMEM);
1da177e4
LT
646 }
647
648 memset(conntrack, 0, sizeof(*conntrack));
649 atomic_set(&conntrack->ct_general.use, 1);
650 conntrack->ct_general.destroy = destroy_conntrack;
080774a2
HW
651 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
652 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
1da177e4
LT
653 /* Don't set timer yet: wait for confirmation */
654 init_timer(&conntrack->timeout);
655 conntrack->timeout.data = (unsigned long)conntrack;
656 conntrack->timeout.function = death_by_timeout;
657
080774a2
HW
658 return conntrack;
659}
660
661void
662ip_conntrack_free(struct ip_conntrack *conntrack)
663{
664 atomic_dec(&ip_conntrack_count);
665 kmem_cache_free(ip_conntrack_cachep, conntrack);
666}
667
668/* Allocate a new conntrack: we return -ENOMEM if classification
669 * failed due to stress. Otherwise it really is unclassifiable */
670static struct ip_conntrack_tuple_hash *
671init_conntrack(struct ip_conntrack_tuple *tuple,
672 struct ip_conntrack_protocol *protocol,
673 struct sk_buff *skb)
674{
675 struct ip_conntrack *conntrack;
676 struct ip_conntrack_tuple repl_tuple;
677 struct ip_conntrack_expect *exp;
678
679 if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
680 DEBUGP("Can't invert tuple.\n");
681 return NULL;
682 }
683
7663f188
YK
684 conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
685 if (conntrack == NULL || IS_ERR(conntrack))
686 return (struct ip_conntrack_tuple_hash *)conntrack;
080774a2
HW
687
688 if (!protocol->new(conntrack, skb)) {
689 ip_conntrack_free(conntrack);
690 return NULL;
691 }
692
e45b1be8 693 write_lock_bh(&ip_conntrack_lock);
1da177e4
LT
694 exp = find_expectation(tuple);
695
696 if (exp) {
697 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
698 conntrack, exp);
699 /* Welcome, Mr. Bond. We've been expecting you... */
700 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
701 conntrack->master = exp->master;
7cee432a 702#ifdef CONFIG_IP_NF_CONNTRACK_MARK
1da177e4 703 conntrack->mark = exp->master->mark;
1f494c0e
HW
704#endif
705#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
706 defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
707	/* this is ugly, but there is no other place to put it */
708 conntrack->nat.masq_index = exp->master->nat.masq_index;
7c9728c3
JM
709#endif
710#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
711 conntrack->secmark = exp->master->secmark;
1da177e4
LT
712#endif
713 nf_conntrack_get(&conntrack->master->ct_general);
714 CONNTRACK_STAT_INC(expect_new);
715 } else {
080774a2 716 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
1da177e4
LT
717
718 CONNTRACK_STAT_INC(new);
719 }
720
721 /* Overload tuple linked list to put us in unconfirmed list. */
722 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
723
e45b1be8 724 write_unlock_bh(&ip_conntrack_lock);
1da177e4
LT
725
726 if (exp) {
727 if (exp->expectfn)
728 exp->expectfn(conntrack, exp);
4acdbdbe 729 ip_conntrack_expect_put(exp);
1da177e4
LT
730 }
731
732 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
733}
734
735/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
736static inline struct ip_conntrack *
737resolve_normal_ct(struct sk_buff *skb,
738 struct ip_conntrack_protocol *proto,
739 int *set_reply,
740 unsigned int hooknum,
741 enum ip_conntrack_info *ctinfo)
742{
743 struct ip_conntrack_tuple tuple;
744 struct ip_conntrack_tuple_hash *h;
745 struct ip_conntrack *ct;
746
747 IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
748
749 if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
750 &tuple,proto))
751 return NULL;
752
753 /* look for tuple match */
754 h = ip_conntrack_find_get(&tuple, NULL);
755 if (!h) {
756 h = init_conntrack(&tuple, proto, skb);
757 if (!h)
758 return NULL;
759 if (IS_ERR(h))
760 return (void *)h;
761 }
762 ct = tuplehash_to_ctrack(h);
763
764	/* It exists; we have a (non-exclusive) reference. */
765 if (DIRECTION(h) == IP_CT_DIR_REPLY) {
766 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
767		/* Please set the reply bit if this packet is OK */
768 *set_reply = 1;
769 } else {
770 /* Once we've had two way comms, always ESTABLISHED. */
771 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
772 DEBUGP("ip_conntrack_in: normal packet for %p\n",
773 ct);
774 *ctinfo = IP_CT_ESTABLISHED;
775 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
776 DEBUGP("ip_conntrack_in: related packet for %p\n",
777 ct);
778 *ctinfo = IP_CT_RELATED;
779 } else {
780 DEBUGP("ip_conntrack_in: new packet for %p\n",
781 ct);
782 *ctinfo = IP_CT_NEW;
783 }
784 *set_reply = 0;
785 }
786 skb->nfct = &ct->ct_general;
787 skb->nfctinfo = *ctinfo;
788 return ct;
789}
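/* Summary of the ctinfo values chosen above, as derived from the code: a
 * packet in the REPLY direction gets IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
 * in the ORIGINAL direction it gets IP_CT_ESTABLISHED once
 * IPS_SEEN_REPLY_BIT is set, IP_CT_RELATED if the conntrack was created
 * from an expectation, and IP_CT_NEW otherwise. */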
790
791/* Netfilter hook itself. */
792unsigned int ip_conntrack_in(unsigned int hooknum,
793 struct sk_buff **pskb,
794 const struct net_device *in,
795 const struct net_device *out,
796 int (*okfn)(struct sk_buff *))
797{
798 struct ip_conntrack *ct;
799 enum ip_conntrack_info ctinfo;
800 struct ip_conntrack_protocol *proto;
ac3247ba 801 int set_reply = 0;
1da177e4
LT
802 int ret;
803
804 /* Previously seen (loopback or untracked)? Ignore. */
805 if ((*pskb)->nfct) {
806 CONNTRACK_STAT_INC(ignore);
807 return NF_ACCEPT;
808 }
809
810	/* Should never happen */
811 if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
812 if (net_ratelimit()) {
813 printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
814 (*pskb)->nh.iph->protocol, hooknum);
815 }
816 return NF_DROP;
817 }
818
1da177e4
LT
819/* Doesn't cover locally-generated broadcast, so not worth it. */
820#if 0
821 /* Ignore broadcast: no `connection'. */
822 if ((*pskb)->pkt_type == PACKET_BROADCAST) {
823 printk("Broadcast packet!\n");
824 return NF_ACCEPT;
825 } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
826 == htonl(0x000000FF)) {
827 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
828 NIPQUAD((*pskb)->nh.iph->saddr),
829 NIPQUAD((*pskb)->nh.iph->daddr),
830 (*pskb)->sk, (*pskb)->pkt_type);
831 }
832#endif
833
080774a2 834 proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
1da177e4
LT
835
836	/* It may be a special packet, error, unclean...
837	 * The inverse of the return code tells the netfilter
838	 * core what to do with the packet. */
839 if (proto->error != NULL
840 && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
841 CONNTRACK_STAT_INC(error);
842 CONNTRACK_STAT_INC(invalid);
843 return -ret;
844 }
845
846 if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
847 /* Not valid part of a connection */
848 CONNTRACK_STAT_INC(invalid);
849 return NF_ACCEPT;
850 }
851
852 if (IS_ERR(ct)) {
853 /* Too stressed to deal. */
854 CONNTRACK_STAT_INC(drop);
855 return NF_DROP;
856 }
857
858 IP_NF_ASSERT((*pskb)->nfct);
859
860 ret = proto->packet(ct, *pskb, ctinfo);
861 if (ret < 0) {
862		/* Invalid: the inverse of the return code tells
863		 * the netfilter core what to do */
864 nf_conntrack_put((*pskb)->nfct);
865 (*pskb)->nfct = NULL;
866 CONNTRACK_STAT_INC(invalid);
867 return -ret;
868 }
869
ac3247ba
HW
870 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
871 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
1da177e4
LT
872
873 return ret;
874}
875
876int invert_tuplepr(struct ip_conntrack_tuple *inverse,
877 const struct ip_conntrack_tuple *orig)
878{
879 return ip_ct_invert_tuple(inverse, orig,
080774a2 880 __ip_conntrack_proto_find(orig->dst.protonum));
1da177e4
LT
881}
882
883/* Would two expected things clash? */
884static inline int expect_clash(const struct ip_conntrack_expect *a,
885 const struct ip_conntrack_expect *b)
886{
887 /* Part covered by intersection of masks must be unequal,
888 otherwise they clash */
889 struct ip_conntrack_tuple intersect_mask
890 = { { a->mask.src.ip & b->mask.src.ip,
891 { a->mask.src.u.all & b->mask.src.u.all } },
892 { a->mask.dst.ip & b->mask.dst.ip,
893 { a->mask.dst.u.all & b->mask.dst.u.all },
894 a->mask.dst.protonum & b->mask.dst.protonum } };
895
896 return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
897}
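/* Worked example for expect_clash(), with made-up values: if expectation A
 * covers 10.0.0.1:any -> 10.0.0.2:2000/TCP (source port mask 0, everything
 * else fully masked) and expectation B covers 10.0.0.1:4000 ->
 * 10.0.0.2:2000/TCP with full masks, the intersection of the masks ignores
 * the source port, the remaining fields compare equal, and the two
 * expectations clash because they could match the same future packet. */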
898
899static inline int expect_matches(const struct ip_conntrack_expect *a,
900 const struct ip_conntrack_expect *b)
901{
902 return a->master == b->master
903 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
904 && ip_ct_tuple_equal(&a->mask, &b->mask);
905}
906
907/* Generally a bad idea to call this: could have matched already. */
908void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
909{
910 struct ip_conntrack_expect *i;
911
e45b1be8 912 write_lock_bh(&ip_conntrack_lock);
1da177e4
LT
913	/* choose the oldest expectation to evict */
914 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
915 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
49719eb3 916 ip_ct_unlink_expect(i);
e45b1be8 917 write_unlock_bh(&ip_conntrack_lock);
4acdbdbe 918 ip_conntrack_expect_put(i);
1da177e4
LT
919 return;
920 }
921 }
e45b1be8 922 write_unlock_bh(&ip_conntrack_lock);
1da177e4
LT
923}
924
91c46e2e
PNA
925/* We don't increase the master conntrack refcount for non-fulfilled
926 * conntracks. During the conntrack destruction, the expectations are
927 * always killed before the conntrack itself */
4acdbdbe 928struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
1da177e4
LT
929{
930 struct ip_conntrack_expect *new;
931
932 new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
933 if (!new) {
934 DEBUGP("expect_related: OOM allocating expect\n");
935 return NULL;
936 }
4acdbdbe 937 new->master = me;
4acdbdbe 938 atomic_set(&new->use, 1);
1da177e4
LT
939 return new;
940}
941
4acdbdbe 942void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
1da177e4 943{
91c46e2e 944 if (atomic_dec_and_test(&exp->use))
4acdbdbe 945 kmem_cache_free(ip_conntrack_expect_cachep, exp);
1da177e4
LT
946}
947
948static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
949{
4acdbdbe 950 atomic_inc(&exp->use);
1da177e4
LT
951 exp->master->expecting++;
952 list_add(&exp->list, &ip_conntrack_expect_list);
953
1d3cdb41
PO
954 init_timer(&exp->timeout);
955 exp->timeout.data = (unsigned long)exp;
956 exp->timeout.function = expectation_timed_out;
957 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
958 add_timer(&exp->timeout);
1da177e4 959
080774a2
HW
960 exp->id = ++ip_conntrack_expect_next_id;
961 atomic_inc(&exp->use);
1da177e4
LT
962 CONNTRACK_STAT_INC(expect_create);
963}
964
965/* Race with expectations being used means we could have none to find; OK. */
966static void evict_oldest_expect(struct ip_conntrack *master)
967{
968 struct ip_conntrack_expect *i;
969
970 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
971 if (i->master == master) {
972 if (del_timer(&i->timeout)) {
49719eb3 973 ip_ct_unlink_expect(i);
4acdbdbe 974 ip_conntrack_expect_put(i);
1da177e4
LT
975 }
976 break;
977 }
978 }
979}
980
981static inline int refresh_timer(struct ip_conntrack_expect *i)
982{
983 if (!del_timer(&i->timeout))
984 return 0;
985
986 i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
987 add_timer(&i->timeout);
988 return 1;
989}
990
991int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
992{
993 struct ip_conntrack_expect *i;
994 int ret;
995
996	DEBUGP("ip_conntrack_expect_related %p\n", expect);
997 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
998 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
999
e45b1be8 1000 write_lock_bh(&ip_conntrack_lock);
1da177e4
LT
1001 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1002 if (expect_matches(i, expect)) {
1003 /* Refresh timer: if it's dying, ignore.. */
1004 if (refresh_timer(i)) {
1005 ret = 0;
1da177e4
LT
1006 goto out;
1007 }
1008 } else if (expect_clash(i, expect)) {
1009 ret = -EBUSY;
1010 goto out;
1011 }
1012 }
1013
1014	/* Will we be over the limit? */
1015 if (expect->master->helper->max_expected &&
1016 expect->master->expecting >= expect->master->helper->max_expected)
1017 evict_oldest_expect(expect->master);
1018
1019 ip_conntrack_expect_insert(expect);
ac3247ba 1020 ip_conntrack_expect_event(IPEXP_NEW, expect);
1da177e4
LT
1021 ret = 0;
1022out:
e45b1be8 1023 write_unlock_bh(&ip_conntrack_lock);
1da177e4
LT
1024 return ret;
1025}
1026
1027/* Alter reply tuple (maybe alter helper). This is for NAT, and is
1028 implicitly racy: see __ip_conntrack_confirm */
1029void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1030 const struct ip_conntrack_tuple *newreply)
1031{
e45b1be8 1032 write_lock_bh(&ip_conntrack_lock);
1da177e4
LT
1033 /* Should be unconfirmed, so not in hash table yet */
1034 IP_NF_ASSERT(!is_confirmed(conntrack));
1035
1036 DEBUGP("Altering reply tuple of %p to ", conntrack);
1037 DUMP_TUPLE(newreply);
1038
1039 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1040 if (!conntrack->master && conntrack->expecting == 0)
080774a2 1041 conntrack->helper = __ip_conntrack_helper_find(newreply);
e45b1be8 1042 write_unlock_bh(&ip_conntrack_lock);
1da177e4
LT
1043}
1044
1045int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1046{
1047 BUG_ON(me->timeout == 0);
e45b1be8 1048 write_lock_bh(&ip_conntrack_lock);
df0933dc 1049 list_add(&me->list, &helpers);
e45b1be8 1050 write_unlock_bh(&ip_conntrack_lock);
1da177e4
LT
1051
1052 return 0;
1053}
1054
080774a2
HW
1055struct ip_conntrack_helper *
1056__ip_conntrack_helper_find_byname(const char *name)
1057{
1058 struct ip_conntrack_helper *h;
1059
1060 list_for_each_entry(h, &helpers, list) {
1061 if (!strcmp(h->name, name))
1062 return h;
1063 }
1064
1065 return NULL;
1066}
1067
df0933dc
PM
1068static inline void unhelp(struct ip_conntrack_tuple_hash *i,
1069 const struct ip_conntrack_helper *me)
1da177e4 1070{
ac3247ba
HW
1071 if (tuplehash_to_ctrack(i)->helper == me) {
1072 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1da177e4 1073 tuplehash_to_ctrack(i)->helper = NULL;
ac3247ba 1074 }
1da177e4
LT
1075}
1076
1077void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1078{
1079 unsigned int i;
df0933dc 1080 struct ip_conntrack_tuple_hash *h;
1da177e4
LT
1081 struct ip_conntrack_expect *exp, *tmp;
1082
1083 /* Need write lock here, to delete helper. */
e45b1be8 1084 write_lock_bh(&ip_conntrack_lock);
df0933dc 1085 list_del(&me->list);
1da177e4
LT
1086
1087 /* Get rid of expectations */
1088 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1089 if (exp->master->helper == me && del_timer(&exp->timeout)) {
49719eb3 1090 ip_ct_unlink_expect(exp);
4acdbdbe 1091 ip_conntrack_expect_put(exp);
1da177e4
LT
1092 }
1093 }
1094 /* Get rid of expecteds, set helpers to NULL. */
df0933dc
PM
1095 list_for_each_entry(h, &unconfirmed, list)
1096 unhelp(h, me);
1097 for (i = 0; i < ip_conntrack_htable_size; i++) {
1098 list_for_each_entry(h, &ip_conntrack_hash[i], list)
1099 unhelp(h, me);
1100 }
e45b1be8 1101 write_unlock_bh(&ip_conntrack_lock);
1da177e4
LT
1102
1103	/* Someone could still be looking at the helper in a bh. */
1104 synchronize_net();
1105}
1106
1dfbab59
HW
1107/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1108void __ip_ct_refresh_acct(struct ip_conntrack *ct,
1da177e4
LT
1109 enum ip_conntrack_info ctinfo,
1110 const struct sk_buff *skb,
1dfbab59
HW
1111 unsigned long extra_jiffies,
1112 int do_acct)
1da177e4 1113{
a051a8f7 1114 int event = 0;
1dfbab59 1115
1da177e4 1116 IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1dfbab59
HW
1117 IP_NF_ASSERT(skb);
1118
1119 write_lock_bh(&ip_conntrack_lock);
1da177e4 1120
997ae831
EL
1121 /* Only update if this is not a fixed timeout */
1122 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
1123 write_unlock_bh(&ip_conntrack_lock);
1124 return;
1125 }
1126
1da177e4
LT
1127 /* If not in hash table, timer will not be active yet */
1128 if (!is_confirmed(ct)) {
1129 ct->timeout.expires = extra_jiffies;
a051a8f7 1130 event = IPCT_REFRESH;
1da177e4 1131 } else {
1da177e4
LT
1132 /* Need del_timer for race avoidance (may already be dying). */
1133 if (del_timer(&ct->timeout)) {
1134 ct->timeout.expires = jiffies + extra_jiffies;
1135 add_timer(&ct->timeout);
a051a8f7 1136 event = IPCT_REFRESH;
1da177e4 1137 }
1da177e4 1138 }
1dfbab59
HW
1139
1140#ifdef CONFIG_IP_NF_CT_ACCT
1141 if (do_acct) {
1142 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1143 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1144 ntohs(skb->nh.iph->tot_len);
a051a8f7
HW
1145 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1146 || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1147 event |= IPCT_COUNTER_FILLING;
1dfbab59
HW
1148 }
1149#endif
1150
1151 write_unlock_bh(&ip_conntrack_lock);
1152
1153 /* must be unlocked when calling event cache */
a051a8f7
HW
1154 if (event)
1155 ip_conntrack_event_cache(event, skb);
1da177e4
LT
1156}
1157
080774a2
HW
1158#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1159 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1160/* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1161 * in ip_conntrack_core, since we don't want the protocols to autoload
1162 * or depend on ctnetlink */
1163int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1164 const struct ip_conntrack_tuple *tuple)
1165{
cdcb71bf 1166 NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(__be16),
080774a2 1167 &tuple->src.u.tcp.port);
cdcb71bf 1168 NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(__be16),
080774a2
HW
1169 &tuple->dst.u.tcp.port);
1170 return 0;
1171
1172nfattr_failure:
1173 return -1;
1174}
1175
1176int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1177 struct ip_conntrack_tuple *t)
1178{
1179 if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1180 return -EINVAL;
1181
1182 t->src.u.tcp.port =
cdcb71bf 1183 *(__be16 *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
080774a2 1184 t->dst.u.tcp.port =
cdcb71bf 1185 *(__be16 *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
080774a2
HW
1186
1187 return 0;
1188}
1189#endif
1190
1da177e4
LT
1191/* Returns new sk_buff, or NULL */
1192struct sk_buff *
1193ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1194{
8be58932 1195 skb_orphan(skb);
1da177e4
LT
1196
1197 local_bh_disable();
1198 skb = ip_defrag(skb, user);
1199 local_bh_enable();
1200
6869c4d8 1201 if (skb)
8be58932 1202 ip_send_check(skb->nh.iph);
1da177e4
LT
1203 return skb;
1204}
1205
1206/* Used by ipt_REJECT. */
1207static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1208{
1209 struct ip_conntrack *ct;
1210 enum ip_conntrack_info ctinfo;
1211
1212	/* This ICMP is in the reverse direction to the packet which caused it */
1213 ct = ip_conntrack_get(skb, &ctinfo);
1214
1215 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1216 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1217 else
1218 ctinfo = IP_CT_RELATED;
1219
1220 /* Attach to new skbuff, and increment count */
1221 nskb->nfct = &ct->ct_general;
1222 nskb->nfctinfo = ctinfo;
1223 nf_conntrack_get(nskb->nfct);
1224}
1225
1da177e4 1226/* Bring out ya dead! */
df0933dc 1227static struct ip_conntrack *
1da177e4
LT
1228get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1229 void *data, unsigned int *bucket)
1230{
df0933dc
PM
1231 struct ip_conntrack_tuple_hash *h;
1232 struct ip_conntrack *ct;
1da177e4 1233
e45b1be8 1234 write_lock_bh(&ip_conntrack_lock);
1da177e4 1235 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
df0933dc
PM
1236 list_for_each_entry(h, &ip_conntrack_hash[*bucket], list) {
1237 ct = tuplehash_to_ctrack(h);
1238 if (iter(ct, data))
1239 goto found;
1240 }
1241 }
1242 list_for_each_entry(h, &unconfirmed, list) {
1243 ct = tuplehash_to_ctrack(h);
1244 if (iter(ct, data))
1245 goto found;
1da177e4 1246 }
e45b1be8 1247 write_unlock_bh(&ip_conntrack_lock);
df0933dc 1248 return NULL;
1da177e4 1249
df0933dc
PM
1250found:
1251 atomic_inc(&ct->ct_general.use);
1252 write_unlock_bh(&ip_conntrack_lock);
1253 return ct;
1da177e4
LT
1254}
1255
1256void
1257ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1258{
df0933dc 1259 struct ip_conntrack *ct;
1da177e4
LT
1260 unsigned int bucket = 0;
1261
df0933dc 1262 while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
1da177e4
LT
1263		/* Time to push up daisies... */
1264 if (del_timer(&ct->timeout))
1265 death_by_timeout((unsigned long)ct);
1266 /* ... else the timer will get him soon. */
1267
1268 ip_conntrack_put(ct);
1269 }
1270}
1271
1272/* Fast function for those who don't want to parse /proc (and I don't
1273 blame them). */
1274/* Reversing the socket's dst/src point of view gives us the reply
1275 mapping. */
1276static int
1277getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1278{
1279 struct inet_sock *inet = inet_sk(sk);
1280 struct ip_conntrack_tuple_hash *h;
1281 struct ip_conntrack_tuple tuple;
1282
1283 IP_CT_TUPLE_U_BLANK(&tuple);
1284 tuple.src.ip = inet->rcv_saddr;
1285 tuple.src.u.tcp.port = inet->sport;
1286 tuple.dst.ip = inet->daddr;
1287 tuple.dst.u.tcp.port = inet->dport;
1288 tuple.dst.protonum = IPPROTO_TCP;
1289
1290 /* We only do TCP at the moment: is there a better way? */
1291 if (strcmp(sk->sk_prot->name, "TCP")) {
1292 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1293 return -ENOPROTOOPT;
1294 }
1295
1296 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1297 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1298 *len, sizeof(struct sockaddr_in));
1299 return -EINVAL;
1300 }
1301
1302 h = ip_conntrack_find_get(&tuple, NULL);
1303 if (h) {
1304 struct sockaddr_in sin;
1305 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1306
1307 sin.sin_family = AF_INET;
1308 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1309 .tuple.dst.u.tcp.port;
1310 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1311 .tuple.dst.ip;
6c813c3f 1312 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
1da177e4
LT
1313
1314 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1315 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1316 ip_conntrack_put(ct);
1317 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1318 return -EFAULT;
1319 else
1320 return 0;
1321 }
1322 DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1323 NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1324 NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1325 return -ENOENT;
1326}
1327
1328static struct nf_sockopt_ops so_getorigdst = {
1329 .pf = PF_INET,
1330 .get_optmin = SO_ORIGINAL_DST,
1331 .get_optmax = SO_ORIGINAL_DST+1,
1332 .get = &getorigdst,
1333};
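/* Minimal userspace sketch of how this sockopt is consumed by a transparent
 * proxy; `fd` is an assumed accepted TCP socket that was REDIRECTed to the
 * proxy, and error handling is omitted:
 *
 *	#include <netinet/in.h>
 *	#include <linux/netfilter_ipv4.h>	// SO_ORIGINAL_DST
 *
 *	struct sockaddr_in orig;
 *	socklen_t len = sizeof(orig);
 *	if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &orig, &len) == 0) {
 *		// orig.sin_addr / orig.sin_port hold the pre-NAT destination
 *	}
 */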
1334
1335static int kill_all(struct ip_conntrack *i, void *data)
1336{
1337 return 1;
1338}
1339
afe5c6bb
PM
1340void ip_conntrack_flush(void)
1341{
1342 ip_ct_iterate_cleanup(kill_all, NULL);
1343}
1344
eed75f19 1345static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
1da177e4 1346{
eed75f19
HW
1347 if (vmalloced)
1348 vfree(hash);
1da177e4 1349 else
eed75f19
HW
1350 free_pages((unsigned long)hash,
1351 get_order(sizeof(struct list_head) * size));
1da177e4
LT
1352}
1353
afe5c6bb
PM
1354/* Mishearing the voices in his head, our hero wonders how he's
1355 supposed to kill the mall. */
1356void ip_conntrack_cleanup(void)
1da177e4 1357{
afe5c6bb
PM
1358 ip_ct_attach = NULL;
1359
1da177e4
LT
1360 /* This makes sure all current packets have passed through
1361 netfilter framework. Roll on, two-stage module
1362 delete... */
1363 synchronize_net();
080774a2 1364
a86888b9 1365 ip_ct_event_cache_flush();
1da177e4 1366 i_see_dead_people:
afe5c6bb 1367 ip_conntrack_flush();
1da177e4
LT
1368 if (atomic_read(&ip_conntrack_count) != 0) {
1369 schedule();
1370 goto i_see_dead_people;
1371 }
21f930e4
PM
1372 /* wait until all references to ip_conntrack_untracked are dropped */
1373 while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1374 schedule();
1da177e4
LT
1375
1376 kmem_cache_destroy(ip_conntrack_cachep);
1377 kmem_cache_destroy(ip_conntrack_expect_cachep);
eed75f19
HW
1378 free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1379 ip_conntrack_htable_size);
1da177e4
LT
1380 nf_unregister_sockopt(&so_getorigdst);
1381}
1382
eed75f19
HW
1383static struct list_head *alloc_hashtable(int size, int *vmalloced)
1384{
1385 struct list_head *hash;
1386 unsigned int i;
1387
1388 *vmalloced = 0;
1389 hash = (void*)__get_free_pages(GFP_KERNEL,
1390 get_order(sizeof(struct list_head)
1391 * size));
1392 if (!hash) {
1393 *vmalloced = 1;
1394 printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n");
1395 hash = vmalloc(sizeof(struct list_head) * size);
1396 }
1397
1398 if (hash)
1399 for (i = 0; i < size; i++)
1400 INIT_LIST_HEAD(&hash[i]);
1401
1402 return hash;
1403}
1404
d127e94a 1405static int set_hashsize(const char *val, struct kernel_param *kp)
eed75f19
HW
1406{
1407 int i, bucket, hashsize, vmalloced;
1408 int old_vmalloced, old_size;
1409 int rnd;
1410 struct list_head *hash, *old_hash;
1411 struct ip_conntrack_tuple_hash *h;
1412
1413 /* On boot, we can set this without any fancy locking. */
1414 if (!ip_conntrack_htable_size)
1415 return param_set_int(val, kp);
1416
1417 hashsize = simple_strtol(val, NULL, 0);
1418 if (!hashsize)
1419 return -EINVAL;
1420
1421 hash = alloc_hashtable(hashsize, &vmalloced);
1422 if (!hash)
1423 return -ENOMEM;
1424
1425	/* We have to rehash for the new table anyway, so we can also
1426	 * use a new random seed */
1427 get_random_bytes(&rnd, 4);
1428
1429 write_lock_bh(&ip_conntrack_lock);
1430 for (i = 0; i < ip_conntrack_htable_size; i++) {
1431 while (!list_empty(&ip_conntrack_hash[i])) {
1432 h = list_entry(ip_conntrack_hash[i].next,
1433 struct ip_conntrack_tuple_hash, list);
1434 list_del(&h->list);
1435 bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1436 list_add_tail(&h->list, &hash[bucket]);
1437 }
1438 }
1439 old_size = ip_conntrack_htable_size;
1440 old_vmalloced = ip_conntrack_vmalloc;
1441 old_hash = ip_conntrack_hash;
1442
1443 ip_conntrack_htable_size = hashsize;
1444 ip_conntrack_vmalloc = vmalloced;
1445 ip_conntrack_hash = hash;
1446 ip_conntrack_hash_rnd = rnd;
1447 write_unlock_bh(&ip_conntrack_lock);
1448
1449 free_conntrack_hash(old_hash, old_vmalloced, old_size);
1450 return 0;
1451}
1452
1453module_param_call(hashsize, set_hashsize, param_get_uint,
1454 &ip_conntrack_htable_size, 0600);
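/* Likely usage of the parameter above, assuming the code is built as the
 * ip_conntrack module: the table size can be set at load time, e.g.
 * "modprobe ip_conntrack hashsize=16384", and because set_hashsize()
 * rehashes the live entries it can also be changed at runtime by root via
 * /sys/module/ip_conntrack/parameters/hashsize. */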
1da177e4
LT
1455
1456int __init ip_conntrack_init(void)
1457{
1458 unsigned int i;
1459 int ret;
1460
1461 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1462 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
eed75f19 1463 if (!ip_conntrack_htable_size) {
1da177e4
LT
1464 ip_conntrack_htable_size
1465 = (((num_physpages << PAGE_SHIFT) / 16384)
1466 / sizeof(struct list_head));
1467 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1468 ip_conntrack_htable_size = 8192;
1469 if (ip_conntrack_htable_size < 16)
1470 ip_conntrack_htable_size = 16;
1471 }
1472 ip_conntrack_max = 8 * ip_conntrack_htable_size;
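	/* Worked example of the sizing heuristic above (approximate, assuming
	 * 4kB pages and 8-byte list heads): on a 32MB machine,
	 * 32MB / 16384 = 2048 bytes of hash table, 2048 / 8 = 256 buckets,
	 * and ip_conntrack_max = 8 * 256 = 2048 tracked connections. */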
1473
1474 printk("ip_conntrack version %s (%u buckets, %d max)"
1475 " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1476 ip_conntrack_htable_size, ip_conntrack_max,
1477 sizeof(struct ip_conntrack));
1478
1479 ret = nf_register_sockopt(&so_getorigdst);
1480 if (ret != 0) {
1481 printk(KERN_ERR "Unable to register netfilter socket option\n");
1482 return ret;
1483 }
1484
eed75f19
HW
1485 ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
1486 &ip_conntrack_vmalloc);
1da177e4
LT
1487 if (!ip_conntrack_hash) {
1488 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1489 goto err_unreg_sockopt;
1490 }
1491
1492 ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1493 sizeof(struct ip_conntrack), 0,
1494 0, NULL, NULL);
1495 if (!ip_conntrack_cachep) {
1496 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1497 goto err_free_hash;
1498 }
1499
1500 ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1501 sizeof(struct ip_conntrack_expect),
1502 0, 0, NULL, NULL);
1503 if (!ip_conntrack_expect_cachep) {
1504 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1505 goto err_free_conntrack_slab;
1506 }
1507
1508 /* Don't NEED lock here, but good form anyway. */
e45b1be8 1509 write_lock_bh(&ip_conntrack_lock);
1da177e4
LT
1510 for (i = 0; i < MAX_IP_CT_PROTO; i++)
1511 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1512 /* Sew in builtin protocols. */
1513 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1514 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1515 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
e45b1be8 1516 write_unlock_bh(&ip_conntrack_lock);
1da177e4 1517
1da177e4
LT
1518 /* For use by ipt_REJECT */
1519 ip_ct_attach = ip_conntrack_attach;
1520
1521 /* Set up fake conntrack:
1522 - to never be deleted, not in any hashes */
1523 atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1524	/* - and make it look like a confirmed connection */
1525 set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1526
1527 return ret;
1528
1529err_free_conntrack_slab:
1530 kmem_cache_destroy(ip_conntrack_cachep);
1531err_free_hash:
eed75f19
HW
1532 free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1533 ip_conntrack_htable_size);
1da177e4
LT
1534err_unreg_sockopt:
1535 nf_unregister_sockopt(&so_getorigdst);
1536
1537 return -ENOMEM;
1538}