net/netfilter/nf_conntrack_core.c
1/* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
4
5/* (C) 1999-2001 Paul `Rusty' Russell
 6 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
7 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
14 * - new API and handling of conntrack/nat helpers
15 * - now capable of multiple expectations for one master
16 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
17 * - add usage/reference counts to ip_conntrack_expect
18 * - export ip_conntrack[_expect]_{find_get,put} functions
19 * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
20 * - generalize L3 protocol dependent part.
21 * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
22 * - add support for various sizes of conntrack structures.
23 * 26 Jan 2006: Harald Welte <laforge@netfilter.org>
24 * - restructure nf_conn (introduce nf_conn_help)
25 * - redesign 'features' how they were originally intended
26 * 26 Feb 2006: Pablo Neira Ayuso <pablo@eurodev.net>
27 * - add support for L3 protocol module load on demand.
28 *
29 * Derived from net/ipv4/netfilter/ip_conntrack_core.c
30 */
31
32#include <linux/config.h>
33#include <linux/types.h>
34#include <linux/netfilter.h>
35#include <linux/module.h>
36#include <linux/skbuff.h>
37#include <linux/proc_fs.h>
38#include <linux/vmalloc.h>
39#include <linux/stddef.h>
40#include <linux/slab.h>
41#include <linux/random.h>
42#include <linux/jhash.h>
43#include <linux/err.h>
44#include <linux/percpu.h>
45#include <linux/moduleparam.h>
46#include <linux/notifier.h>
47#include <linux/kernel.h>
48#include <linux/netdevice.h>
49#include <linux/socket.h>
50
51/* This rwlock protects the main hash table, protocol/helper/expected
52 registrations, conntrack timers*/
53#define ASSERT_READ_LOCK(x)
54#define ASSERT_WRITE_LOCK(x)
55
56#include <net/netfilter/nf_conntrack.h>
57#include <net/netfilter/nf_conntrack_l3proto.h>
58#include <net/netfilter/nf_conntrack_protocol.h>
59#include <net/netfilter/nf_conntrack_helper.h>
60#include <net/netfilter/nf_conntrack_core.h>
61#include <linux/netfilter_ipv4/listhelp.h>
62
63#define NF_CONNTRACK_VERSION "0.5.0"
64
65#if 0
66#define DEBUGP printk
67#else
68#define DEBUGP(format, args...)
69#endif
70
71DEFINE_RWLOCK(nf_conntrack_lock);
72
73/* nf_conntrack_standalone needs this */
74atomic_t nf_conntrack_count = ATOMIC_INIT(0);
75
76void (*nf_conntrack_destroyed)(struct nf_conn *conntrack) = NULL;
77LIST_HEAD(nf_conntrack_expect_list);
78struct nf_conntrack_protocol **nf_ct_protos[PF_MAX];
79struct nf_conntrack_l3proto *nf_ct_l3protos[PF_MAX];
80static LIST_HEAD(helpers);
81unsigned int nf_conntrack_htable_size = 0;
82int nf_conntrack_max;
83struct list_head *nf_conntrack_hash;
84static kmem_cache_t *nf_conntrack_expect_cachep;
85struct nf_conn nf_conntrack_untracked;
86unsigned int nf_ct_log_invalid;
87static LIST_HEAD(unconfirmed);
88static int nf_conntrack_vmalloc;
89
90static unsigned int nf_conntrack_next_id;
91static unsigned int nf_conntrack_expect_next_id;
92#ifdef CONFIG_NF_CONNTRACK_EVENTS
93ATOMIC_NOTIFIER_HEAD(nf_conntrack_chain);
94ATOMIC_NOTIFIER_HEAD(nf_conntrack_expect_chain);
95
96DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
97
98/* deliver cached events and clear cache entry - must be called with locally
99 * disabled softirqs */
100static inline void
101__nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
102{
103 DEBUGP("ecache: delivering events for %p\n", ecache->ct);
104 if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
105 && ecache->events)
106 atomic_notifier_call_chain(&nf_conntrack_chain, ecache->events,
107 ecache->ct);
108
109 ecache->events = 0;
110 nf_ct_put(ecache->ct);
111 ecache->ct = NULL;
112}
113
114/* Deliver all cached events for a particular conntrack. This is called
115 * by code prior to async packet handling for freeing the skb */
116void nf_ct_deliver_cached_events(const struct nf_conn *ct)
117{
118 struct nf_conntrack_ecache *ecache;
119
120 local_bh_disable();
121 ecache = &__get_cpu_var(nf_conntrack_ecache);
122 if (ecache->ct == ct)
123 __nf_ct_deliver_cached_events(ecache);
124 local_bh_enable();
125}
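/*
 * Usage sketch (illustrative, not part of the original file): code that
 * updates a conntrack during packet processing records the change with
 * nf_conntrack_event_cache() (see its calls in __nf_conntrack_confirm()
 * and nf_conntrack_in() below); the bits accumulate in this CPU's
 * nf_conntrack_ecache and reach the notifier chain either when another
 * conntrack starts using the cache (__nf_ct_event_cache_init) or when
 * they are pushed out explicitly:
 *
 *	nf_conntrack_event_cache(IPCT_STATUS, skb);
 *	...
 *	nf_ct_deliver_cached_events(ct);
 */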
126
127/* Deliver cached events for old pending events, if current conntrack != old */
128void __nf_ct_event_cache_init(struct nf_conn *ct)
129{
130 struct nf_conntrack_ecache *ecache;
131
132 /* take care of delivering potentially old events */
133 ecache = &__get_cpu_var(nf_conntrack_ecache);
134 BUG_ON(ecache->ct == ct);
135 if (ecache->ct)
136 __nf_ct_deliver_cached_events(ecache);
137 /* initialize for this conntrack/packet */
138 ecache->ct = ct;
139 nf_conntrack_get(&ct->ct_general);
140}
141
142/* flush the event cache - touches other CPU's data and must not be called
143 * while packets are still passing through the code */
144static void nf_ct_event_cache_flush(void)
145{
146 struct nf_conntrack_ecache *ecache;
147 int cpu;
148
149 for_each_possible_cpu(cpu) {
150 ecache = &per_cpu(nf_conntrack_ecache, cpu);
151 if (ecache->ct)
152 nf_ct_put(ecache->ct);
153 }
154}
155#else
156static inline void nf_ct_event_cache_flush(void) {}
157#endif /* CONFIG_NF_CONNTRACK_EVENTS */
158
159DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
160EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat);
161
162/*
163 * This scheme offers various sizes of "struct nf_conn" depending on
164 * features (helper, nat, ...)
165 */
166
167#define NF_CT_FEATURES_NAMELEN 256
168static struct {
169 /* name of slab cache. printed in /proc/slabinfo */
170 char *name;
171
172 /* size of slab cache */
173 size_t size;
174
175 /* slab cache pointer */
176 kmem_cache_t *cachep;
177
178 /* allocated slab cache + modules which use this slab cache */
179 int use;
180
181} nf_ct_cache[NF_CT_F_NUM];
182
183/* protect members of nf_ct_cache except of "use" */
184DEFINE_RWLOCK(nf_ct_cache_lock);
185
186/* This avoids calling kmem_cache_create() with same name simultaneously */
187static DEFINE_MUTEX(nf_ct_cache_mutex);
188
189extern struct nf_conntrack_protocol nf_conntrack_generic_protocol;
190struct nf_conntrack_protocol *
191__nf_ct_proto_find(u_int16_t l3proto, u_int8_t protocol)
192{
193 if (unlikely(l3proto >= AF_MAX || nf_ct_protos[l3proto] == NULL))
194 return &nf_conntrack_generic_protocol;
195
196 return nf_ct_protos[l3proto][protocol];
197}
198
199/* this is guaranteed to always return a valid protocol helper, since
200 * it falls back to generic_protocol */
201struct nf_conntrack_protocol *
202nf_ct_proto_find_get(u_int16_t l3proto, u_int8_t protocol)
203{
204 struct nf_conntrack_protocol *p;
205
206 preempt_disable();
207 p = __nf_ct_proto_find(l3proto, protocol);
208 if (!try_module_get(p->me))
209 p = &nf_conntrack_generic_protocol;
210 preempt_enable();
211
212 return p;
213}
214
215void nf_ct_proto_put(struct nf_conntrack_protocol *p)
216{
217 module_put(p->me);
218}
219
220struct nf_conntrack_l3proto *
221nf_ct_l3proto_find_get(u_int16_t l3proto)
222{
223 struct nf_conntrack_l3proto *p;
224
225 preempt_disable();
226 p = __nf_ct_l3proto_find(l3proto);
227 if (!try_module_get(p->me))
228 p = &nf_conntrack_generic_l3proto;
229 preempt_enable();
230
231 return p;
232}
233
234void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p)
235{
236 module_put(p->me);
237}
238
239int
240nf_ct_l3proto_try_module_get(unsigned short l3proto)
241{
242 int ret;
243 struct nf_conntrack_l3proto *p;
244
245retry: p = nf_ct_l3proto_find_get(l3proto);
246 if (p == &nf_conntrack_generic_l3proto) {
247 ret = request_module("nf_conntrack-%d", l3proto);
248 if (!ret)
249 goto retry;
250
251 return -EPROTOTYPE;
252 }
253
254 return 0;
255}
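/*
 * Example (illustrative): nf_ct_l3proto_try_module_get(PF_INET) first
 * looks up the registered l3proto; if only the generic fallback is
 * found, request_module() above asks for "nf_conntrack-2" (the name is
 * built from the protocol family number) and the lookup is retried.
 * The per-family modules are expected to provide a matching module
 * alias for the on-demand load to succeed.
 */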
256
257void nf_ct_l3proto_module_put(unsigned short l3proto)
258{
259 struct nf_conntrack_l3proto *p;
260
261 preempt_disable();
262 p = __nf_ct_l3proto_find(l3proto);
263 preempt_enable();
264
265 module_put(p->me);
266}
267
268static int nf_conntrack_hash_rnd_initted;
269static unsigned int nf_conntrack_hash_rnd;
270
271static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
272 unsigned int size, unsigned int rnd)
273{
274 unsigned int a, b;
275 a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all),
276 ((tuple->src.l3num) << 16) | tuple->dst.protonum);
277 b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all),
278 (tuple->src.u.all << 16) | tuple->dst.u.all);
279
280 return jhash_2words(a, b, rnd) % size;
281}
282
283static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
284{
285 return __hash_conntrack(tuple, nf_conntrack_htable_size,
286 nf_conntrack_hash_rnd);
287}
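/*
 * Note (illustrative): a connection occupies two buckets, one per
 * direction, because the original and reply tuples normally hash
 * differently:
 *
 *	hash      = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 *	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 *
 * which is why clean_from_lists() and __nf_conntrack_confirm() below
 * always compute both indices.
 */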
288
289int nf_conntrack_register_cache(u_int32_t features, const char *name,
290 size_t size)
291{
292 int ret = 0;
293 char *cache_name;
294 kmem_cache_t *cachep;
295
296 DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n",
297 features, name, size);
298
299 if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) {
300 DEBUGP("nf_conntrack_register_cache: invalid features.: 0x%x\n",
301 features);
302 return -EINVAL;
303 }
304
305 mutex_lock(&nf_ct_cache_mutex);
306
307 write_lock_bh(&nf_ct_cache_lock);
308 /* e.g: multiple helpers are loaded */
309 if (nf_ct_cache[features].use > 0) {
310 DEBUGP("nf_conntrack_register_cache: already registered.\n");
311 if ((!strncmp(nf_ct_cache[features].name, name,
312 NF_CT_FEATURES_NAMELEN))
313 && nf_ct_cache[features].size == size) {
314 DEBUGP("nf_conntrack_register_cache: reusing.\n");
315 nf_ct_cache[features].use++;
316 ret = 0;
317 } else
318 ret = -EBUSY;
319
320 write_unlock_bh(&nf_ct_cache_lock);
321 mutex_unlock(&nf_ct_cache_mutex);
322 return ret;
323 }
324 write_unlock_bh(&nf_ct_cache_lock);
325
326 /*
327 * The memory space for name of slab cache must be alive until
328 * cache is destroyed.
329 */
330 cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC);
331 if (cache_name == NULL) {
332 DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n");
333 ret = -ENOMEM;
334 goto out_up_mutex;
335 }
336
337 if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN)
338 >= NF_CT_FEATURES_NAMELEN) {
339 printk("nf_conntrack_register_cache: name too long\n");
340 ret = -EINVAL;
341 goto out_free_name;
342 }
343
344 cachep = kmem_cache_create(cache_name, size, 0, 0,
345 NULL, NULL);
346 if (!cachep) {
347 printk("nf_conntrack_register_cache: Can't create slab cache "
348 "for the features = 0x%x\n", features);
349 ret = -ENOMEM;
350 goto out_free_name;
351 }
352
353 write_lock_bh(&nf_ct_cache_lock);
354 nf_ct_cache[features].use = 1;
355 nf_ct_cache[features].size = size;
356 nf_ct_cache[features].cachep = cachep;
357 nf_ct_cache[features].name = cache_name;
358 write_unlock_bh(&nf_ct_cache_lock);
359
360 goto out_up_mutex;
361
362out_free_name:
363 kfree(cache_name);
364out_up_mutex:
365 mutex_unlock(&nf_ct_cache_mutex);
366 return ret;
367}
368
369/* FIXME: At present, only nf_conntrack_cleanup() can call this function. */
370void nf_conntrack_unregister_cache(u_int32_t features)
371{
372 kmem_cache_t *cachep;
373 char *name;
374
375 /*
376 * This assures that kmem_cache_create() isn't called before destroying
377 * slab cache.
378 */
379 DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features);
380 mutex_lock(&nf_ct_cache_mutex);
381
382 write_lock_bh(&nf_ct_cache_lock);
383 if (--nf_ct_cache[features].use > 0) {
384 write_unlock_bh(&nf_ct_cache_lock);
385 mutex_unlock(&nf_ct_cache_mutex);
386 return;
387 }
388 cachep = nf_ct_cache[features].cachep;
389 name = nf_ct_cache[features].name;
390 nf_ct_cache[features].cachep = NULL;
391 nf_ct_cache[features].name = NULL;
392 nf_ct_cache[features].size = 0;
393 write_unlock_bh(&nf_ct_cache_lock);
394
395 synchronize_net();
396
397 kmem_cache_destroy(cachep);
398 kfree(name);
399
400 mutex_unlock(&nf_ct_cache_mutex);
401}
402
403int
404nf_ct_get_tuple(const struct sk_buff *skb,
405 unsigned int nhoff,
406 unsigned int dataoff,
407 u_int16_t l3num,
408 u_int8_t protonum,
409 struct nf_conntrack_tuple *tuple,
410 const struct nf_conntrack_l3proto *l3proto,
411 const struct nf_conntrack_protocol *protocol)
412{
413 NF_CT_TUPLE_U_BLANK(tuple);
414
415 tuple->src.l3num = l3num;
416 if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
417 return 0;
418
419 tuple->dst.protonum = protonum;
420 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
421
422 return protocol->pkt_to_tuple(skb, dataoff, tuple);
423}
424
425int
426nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
427 const struct nf_conntrack_tuple *orig,
428 const struct nf_conntrack_l3proto *l3proto,
429 const struct nf_conntrack_protocol *protocol)
430{
431 NF_CT_TUPLE_U_BLANK(inverse);
432
433 inverse->src.l3num = orig->src.l3num;
434 if (l3proto->invert_tuple(inverse, orig) == 0)
435 return 0;
436
437 inverse->dst.dir = !orig->dst.dir;
438
439 inverse->dst.protonum = orig->dst.protonum;
440 return protocol->invert_tuple(inverse, orig);
441}
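/*
 * Worked example (illustrative, TCP over IPv4): inverting
 *
 *	orig:    src 192.168.0.1:34567 -> dst 10.0.0.1:21, protonum 6
 *
 * is expected to give
 *
 *	inverse: src 10.0.0.1:21 -> dst 192.168.0.1:34567, protonum 6
 *
 * with inverse->dst.dir flipped relative to orig->dst.dir; the actual
 * address and port swapping is done by the l3proto and protocol
 * invert_tuple() callbacks, not here.
 */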
442
443/* nf_conntrack_expect helper functions */
444void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
445{
446 struct nf_conn_help *master_help = nfct_help(exp->master);
447
448 NF_CT_ASSERT(master_help);
449 ASSERT_WRITE_LOCK(&nf_conntrack_lock);
450 NF_CT_ASSERT(!timer_pending(&exp->timeout));
451
452 list_del(&exp->list);
453 NF_CT_STAT_INC(expect_delete);
454 master_help->expecting--;
455 nf_conntrack_expect_put(exp);
456}
457
458static void expectation_timed_out(unsigned long ul_expect)
459{
460 struct nf_conntrack_expect *exp = (void *)ul_expect;
461
462 write_lock_bh(&nf_conntrack_lock);
463 nf_ct_unlink_expect(exp);
464 write_unlock_bh(&nf_conntrack_lock);
465 nf_conntrack_expect_put(exp);
466}
467
468struct nf_conntrack_expect *
469__nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
470{
471 struct nf_conntrack_expect *i;
472
473 list_for_each_entry(i, &nf_conntrack_expect_list, list) {
474 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
475 atomic_inc(&i->use);
476 return i;
477 }
478 }
479 return NULL;
480}
481
482/* Just find an expectation corresponding to a tuple. */
483struct nf_conntrack_expect *
484nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
485{
486 struct nf_conntrack_expect *i;
487
488 read_lock_bh(&nf_conntrack_lock);
489 i = __nf_conntrack_expect_find(tuple);
490 read_unlock_bh(&nf_conntrack_lock);
491
492 return i;
493}
494
495/* If an expectation for this connection is found, it gets deleted from
496 * global list then returned. */
497static struct nf_conntrack_expect *
498find_expectation(const struct nf_conntrack_tuple *tuple)
499{
500 struct nf_conntrack_expect *i;
501
502 list_for_each_entry(i, &nf_conntrack_expect_list, list) {
503 /* If master is not in hash table yet (ie. packet hasn't left
504 this machine yet), how can other end know about expected?
505 Hence these are not the droids you are looking for (if
506 master ct never got confirmed, we'd hold a reference to it
507 and weird things would happen to future packets). */
508 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
509 && nf_ct_is_confirmed(i->master)) {
510 if (i->flags & NF_CT_EXPECT_PERMANENT) {
511 atomic_inc(&i->use);
512 return i;
513 } else if (del_timer(&i->timeout)) {
514 nf_ct_unlink_expect(i);
515 return i;
516 }
517 }
518 }
519 return NULL;
520}
521
522/* delete all expectations for this conntrack */
523void nf_ct_remove_expectations(struct nf_conn *ct)
524{
525 struct nf_conntrack_expect *i, *tmp;
526 struct nf_conn_help *help = nfct_help(ct);
527
528 /* Optimization: most connections never expect any others. */
529 if (!help || help->expecting == 0)
530 return;
531
532 list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) {
533 if (i->master == ct && del_timer(&i->timeout)) {
534 nf_ct_unlink_expect(i);
535 nf_conntrack_expect_put(i);
536 }
537 }
538}
539
540static void
541clean_from_lists(struct nf_conn *ct)
542{
543 unsigned int ho, hr;
544
545 DEBUGP("clean_from_lists(%p)\n", ct);
546 ASSERT_WRITE_LOCK(&nf_conntrack_lock);
547
548 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
549 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
550 LIST_DELETE(&nf_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
551 LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
552
553 /* Destroy all pending expectations */
554 nf_ct_remove_expectations(ct);
555}
556
557static void
558destroy_conntrack(struct nf_conntrack *nfct)
559{
560 struct nf_conn *ct = (struct nf_conn *)nfct;
561 struct nf_conntrack_l3proto *l3proto;
562 struct nf_conntrack_protocol *proto;
563
564 DEBUGP("destroy_conntrack(%p)\n", ct);
565 NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
566 NF_CT_ASSERT(!timer_pending(&ct->timeout));
567
568 nf_conntrack_event(IPCT_DESTROY, ct);
569 set_bit(IPS_DYING_BIT, &ct->status);
570
571 /* To make sure we don't get any weird locking issues here:
572 * destroy_conntrack() MUST NOT be called with a write lock
573 * to nf_conntrack_lock!!! -HW */
574 l3proto = __nf_ct_l3proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num);
575 if (l3proto && l3proto->destroy)
576 l3proto->destroy(ct);
577
578 proto = __nf_ct_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num, ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
579 if (proto && proto->destroy)
580 proto->destroy(ct);
581
582 if (nf_conntrack_destroyed)
583 nf_conntrack_destroyed(ct);
584
585 write_lock_bh(&nf_conntrack_lock);
586 /* Expectations will have been removed in clean_from_lists,
587 * except TFTP can create an expectation on the first packet,
588 * before connection is in the list, so we need to clean here,
589 * too. */
590 nf_ct_remove_expectations(ct);
591
592 /* We overload first tuple to link into unconfirmed list. */
593 if (!nf_ct_is_confirmed(ct)) {
594 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
595 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
596 }
597
598 NF_CT_STAT_INC(delete);
599 write_unlock_bh(&nf_conntrack_lock);
600
601 if (ct->master)
602 nf_ct_put(ct->master);
603
604 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
605 nf_conntrack_free(ct);
606}
607
608static void death_by_timeout(unsigned long ul_conntrack)
609{
610 struct nf_conn *ct = (void *)ul_conntrack;
611
612 write_lock_bh(&nf_conntrack_lock);
613 /* Inside lock so preempt is disabled on module removal path.
614 * Otherwise we can get spurious warnings. */
615 NF_CT_STAT_INC(delete_list);
616 clean_from_lists(ct);
617 write_unlock_bh(&nf_conntrack_lock);
618 nf_ct_put(ct);
619}
620
621static inline int
622conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i,
623 const struct nf_conntrack_tuple *tuple,
624 const struct nf_conn *ignored_conntrack)
625{
626 ASSERT_READ_LOCK(&nf_conntrack_lock);
627 return nf_ct_tuplehash_to_ctrack(i) != ignored_conntrack
628 && nf_ct_tuple_equal(tuple, &i->tuple);
629}
630
631struct nf_conntrack_tuple_hash *
632__nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
633 const struct nf_conn *ignored_conntrack)
634{
635 struct nf_conntrack_tuple_hash *h;
636 unsigned int hash = hash_conntrack(tuple);
637
638 ASSERT_READ_LOCK(&nf_conntrack_lock);
639 list_for_each_entry(h, &nf_conntrack_hash[hash], list) {
640 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
641 NF_CT_STAT_INC(found);
642 return h;
643 }
644 NF_CT_STAT_INC(searched);
645 }
646
647 return NULL;
648}
649
650/* Find a connection corresponding to a tuple. */
651struct nf_conntrack_tuple_hash *
652nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple,
653 const struct nf_conn *ignored_conntrack)
654{
655 struct nf_conntrack_tuple_hash *h;
656
657 read_lock_bh(&nf_conntrack_lock);
658 h = __nf_conntrack_find(tuple, ignored_conntrack);
659 if (h)
660 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
661 read_unlock_bh(&nf_conntrack_lock);
662
663 return h;
664}
665
666static void __nf_conntrack_hash_insert(struct nf_conn *ct,
667 unsigned int hash,
668 unsigned int repl_hash)
669{
670 ct->id = ++nf_conntrack_next_id;
671 list_prepend(&nf_conntrack_hash[hash],
672 &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
673 list_prepend(&nf_conntrack_hash[repl_hash],
674 &ct->tuplehash[IP_CT_DIR_REPLY].list);
675}
676
677void nf_conntrack_hash_insert(struct nf_conn *ct)
678{
679 unsigned int hash, repl_hash;
680
681 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
682 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
683
684 write_lock_bh(&nf_conntrack_lock);
685 __nf_conntrack_hash_insert(ct, hash, repl_hash);
686 write_unlock_bh(&nf_conntrack_lock);
687}
688
689/* Confirm a connection given skb; places it in hash table */
690int
691__nf_conntrack_confirm(struct sk_buff **pskb)
692{
693 unsigned int hash, repl_hash;
694 struct nf_conn *ct;
695 enum ip_conntrack_info ctinfo;
696
697 ct = nf_ct_get(*pskb, &ctinfo);
698
699 /* ipt_REJECT uses nf_conntrack_attach to attach related
700 ICMP/TCP RST packets in other direction. Actual packet
701 which created connection will be IP_CT_NEW or for an
702 expected connection, IP_CT_RELATED. */
703 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
704 return NF_ACCEPT;
705
706 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
707 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
708
709 /* We're not in hash table, and we refuse to set up related
710 connections for unconfirmed conns. But packet copies and
711 REJECT will give spurious warnings here. */
712 /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
713
714 /* No external references means no one else could have
715 confirmed us. */
716 NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
717 DEBUGP("Confirming conntrack %p\n", ct);
718
719 write_lock_bh(&nf_conntrack_lock);
720
721 /* See if there's one in the list already, including reverse:
722 NAT could have grabbed it without realizing, since we're
723 not in the hash. If there is, we lost race. */
724 if (!LIST_FIND(&nf_conntrack_hash[hash],
725 conntrack_tuple_cmp,
726 struct nf_conntrack_tuple_hash *,
727 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
728 && !LIST_FIND(&nf_conntrack_hash[repl_hash],
729 conntrack_tuple_cmp,
730 struct nf_conntrack_tuple_hash *,
731 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
732 struct nf_conn_help *help;
733 /* Remove from unconfirmed list */
734 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
735
736 __nf_conntrack_hash_insert(ct, hash, repl_hash);
737 /* Timer relative to confirmation time, not original
738 setting time, otherwise we'd get timer wrap in
739 weird delay cases. */
740 ct->timeout.expires += jiffies;
741 add_timer(&ct->timeout);
742 atomic_inc(&ct->ct_general.use);
743 set_bit(IPS_CONFIRMED_BIT, &ct->status);
744 NF_CT_STAT_INC(insert);
745 write_unlock_bh(&nf_conntrack_lock);
746 help = nfct_help(ct);
747 if (help && help->helper)
748 nf_conntrack_event_cache(IPCT_HELPER, *pskb);
749#ifdef CONFIG_NF_NAT_NEEDED
750 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
751 test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
752 nf_conntrack_event_cache(IPCT_NATINFO, *pskb);
753#endif
754 nf_conntrack_event_cache(master_ct(ct) ?
755 IPCT_RELATED : IPCT_NEW, *pskb);
756 return NF_ACCEPT;
757 }
758
759 NF_CT_STAT_INC(insert_failed);
760 write_unlock_bh(&nf_conntrack_lock);
761 return NF_DROP;
762}
763
764/* Returns true if a connection corresponds to the tuple (required
765 for NAT). */
766int
767nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
768 const struct nf_conn *ignored_conntrack)
769{
770 struct nf_conntrack_tuple_hash *h;
771
772 read_lock_bh(&nf_conntrack_lock);
773 h = __nf_conntrack_find(tuple, ignored_conntrack);
774 read_unlock_bh(&nf_conntrack_lock);
775
776 return h != NULL;
777}
778
779/* There's a small race here where we may free a just-assured
780 connection. Too bad: we're in trouble anyway. */
781static inline int unreplied(const struct nf_conntrack_tuple_hash *i)
782{
783 return !(test_bit(IPS_ASSURED_BIT,
784 &nf_ct_tuplehash_to_ctrack(i)->status));
785}
786
787static int early_drop(struct list_head *chain)
788{
789 /* Traverse backwards: gives us oldest, which is roughly LRU */
790 struct nf_conntrack_tuple_hash *h;
791 struct nf_conn *ct = NULL;
792 int dropped = 0;
793
794 read_lock_bh(&nf_conntrack_lock);
795 h = LIST_FIND_B(chain, unreplied, struct nf_conntrack_tuple_hash *);
796 if (h) {
797 ct = nf_ct_tuplehash_to_ctrack(h);
798 atomic_inc(&ct->ct_general.use);
799 }
800 read_unlock_bh(&nf_conntrack_lock);
801
802 if (!ct)
803 return dropped;
804
805 if (del_timer(&ct->timeout)) {
806 death_by_timeout((unsigned long)ct);
807 dropped = 1;
808 NF_CT_STAT_INC(early_drop);
809 }
810 nf_ct_put(ct);
811 return dropped;
812}
813
814static inline int helper_cmp(const struct nf_conntrack_helper *i,
815 const struct nf_conntrack_tuple *rtuple)
816{
817 return nf_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
818}
819
820static struct nf_conntrack_helper *
821__nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
822{
823 return LIST_FIND(&helpers, helper_cmp,
824 struct nf_conntrack_helper *,
825 tuple);
826}
827
828struct nf_conntrack_helper *
829nf_ct_helper_find_get( const struct nf_conntrack_tuple *tuple)
830{
831 struct nf_conntrack_helper *helper;
832
833 /* need nf_conntrack_lock to assure that helper exists until
834 * try_module_get() is called */
835 read_lock_bh(&nf_conntrack_lock);
836
837 helper = __nf_ct_helper_find(tuple);
838 if (helper) {
839 /* need to increase module usage count to assure helper will
840 * not go away while the caller is e.g. busy putting a
841 * conntrack in the hash that uses the helper */
842 if (!try_module_get(helper->me))
843 helper = NULL;
844 }
845
846 read_unlock_bh(&nf_conntrack_lock);
847
848 return helper;
849}
850
851void nf_ct_helper_put(struct nf_conntrack_helper *helper)
852{
853 module_put(helper->me);
854}
855
856static struct nf_conn *
857__nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
858 const struct nf_conntrack_tuple *repl,
859 const struct nf_conntrack_l3proto *l3proto)
860{
861 struct nf_conn *conntrack = NULL;
862 u_int32_t features = 0;
863 struct nf_conntrack_helper *helper;
864
865 if (unlikely(!nf_conntrack_hash_rnd_initted)) {
866 get_random_bytes(&nf_conntrack_hash_rnd, 4);
867 nf_conntrack_hash_rnd_initted = 1;
868 }
869
870 if (nf_conntrack_max
871 && atomic_read(&nf_conntrack_count) >= nf_conntrack_max) {
872 unsigned int hash = hash_conntrack(orig);
873 /* Try dropping from this hash chain. */
874 if (!early_drop(&nf_conntrack_hash[hash])) {
875 if (net_ratelimit())
876 printk(KERN_WARNING
877 "nf_conntrack: table full, dropping"
878 " packet.\n");
879 return ERR_PTR(-ENOMEM);
880 }
881 }
882
883 /* find features needed by this conntrack. */
884 features = l3proto->get_features(orig);
885
886 /* FIXME: protect helper list per RCU */
887 read_lock_bh(&nf_conntrack_lock);
888 helper = __nf_ct_helper_find(repl);
889 if (helper)
890 features |= NF_CT_F_HELP;
891 read_unlock_bh(&nf_conntrack_lock);
892
893 DEBUGP("nf_conntrack_alloc: features=0x%x\n", features);
894
895 read_lock_bh(&nf_ct_cache_lock);
896
897 if (unlikely(!nf_ct_cache[features].use)) {
898 DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n",
899 features);
900 goto out;
901 }
902
903 conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC);
904 if (conntrack == NULL) {
905 DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n");
906 goto out;
907 }
908
909 memset(conntrack, 0, nf_ct_cache[features].size);
910 conntrack->features = features;
911 if (helper) {
912 struct nf_conn_help *help = nfct_help(conntrack);
913 NF_CT_ASSERT(help);
914 help->helper = helper;
915 }
916
917 atomic_set(&conntrack->ct_general.use, 1);
918 conntrack->ct_general.destroy = destroy_conntrack;
919 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
920 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
921 /* Don't set timer yet: wait for confirmation */
922 init_timer(&conntrack->timeout);
923 conntrack->timeout.data = (unsigned long)conntrack;
924 conntrack->timeout.function = death_by_timeout;
925
926 atomic_inc(&nf_conntrack_count);
927out:
928 read_unlock_bh(&nf_ct_cache_lock);
929 return conntrack;
930}
931
932struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
933 const struct nf_conntrack_tuple *repl)
934{
935 struct nf_conntrack_l3proto *l3proto;
936
937 l3proto = __nf_ct_l3proto_find(orig->src.l3num);
938 return __nf_conntrack_alloc(orig, repl, l3proto);
939}
940
941void nf_conntrack_free(struct nf_conn *conntrack)
942{
943 u_int32_t features = conntrack->features;
944 NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM);
945 DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features,
946 conntrack);
947 kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
948 atomic_dec(&nf_conntrack_count);
949}
950
951/* Allocate a new conntrack: we return -ENOMEM if classification
952 failed due to stress. Otherwise it really is unclassifiable. */
953static struct nf_conntrack_tuple_hash *
954init_conntrack(const struct nf_conntrack_tuple *tuple,
955 struct nf_conntrack_l3proto *l3proto,
956 struct nf_conntrack_protocol *protocol,
957 struct sk_buff *skb,
958 unsigned int dataoff)
959{
960 struct nf_conn *conntrack;
961 struct nf_conntrack_tuple repl_tuple;
962 struct nf_conntrack_expect *exp;
963
964 if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, protocol)) {
965 DEBUGP("Can't invert tuple.\n");
966 return NULL;
967 }
968
969 conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto);
970 if (conntrack == NULL || IS_ERR(conntrack)) {
971 DEBUGP("Can't allocate conntrack.\n");
972 return (struct nf_conntrack_tuple_hash *)conntrack;
973 }
974
975 if (!protocol->new(conntrack, skb, dataoff)) {
976 nf_conntrack_free(conntrack);
977 DEBUGP("init conntrack: can't track with proto module\n");
978 return NULL;
979 }
980
981 write_lock_bh(&nf_conntrack_lock);
982 exp = find_expectation(tuple);
983
984 if (exp) {
985 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
986 conntrack, exp);
987 /* Welcome, Mr. Bond. We've been expecting you... */
988 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
989 conntrack->master = exp->master;
990#ifdef CONFIG_NF_CONNTRACK_MARK
991 conntrack->mark = exp->master->mark;
992#endif
993 nf_conntrack_get(&conntrack->master->ct_general);
994 NF_CT_STAT_INC(expect_new);
995 } else
996 NF_CT_STAT_INC(new);
997
998 /* Overload tuple linked list to put us in unconfirmed list. */
999 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
1000
1001 write_unlock_bh(&nf_conntrack_lock);
1002
1003 if (exp) {
1004 if (exp->expectfn)
1005 exp->expectfn(conntrack, exp);
1006 nf_conntrack_expect_put(exp);
1007 }
1008
1009 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
1010}
1011
1012/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
1013static inline struct nf_conn *
1014resolve_normal_ct(struct sk_buff *skb,
1015 unsigned int dataoff,
1016 u_int16_t l3num,
1017 u_int8_t protonum,
1018 struct nf_conntrack_l3proto *l3proto,
1019 struct nf_conntrack_protocol *proto,
1020 int *set_reply,
1021 enum ip_conntrack_info *ctinfo)
1022{
1023 struct nf_conntrack_tuple tuple;
1024 struct nf_conntrack_tuple_hash *h;
1025 struct nf_conn *ct;
1026
1027 if (!nf_ct_get_tuple(skb, (unsigned int)(skb->nh.raw - skb->data),
1028 dataoff, l3num, protonum, &tuple, l3proto,
1029 proto)) {
1030 DEBUGP("resolve_normal_ct: Can't get tuple\n");
1031 return NULL;
1032 }
1033
1034 /* look for tuple match */
1035 h = nf_conntrack_find_get(&tuple, NULL);
1036 if (!h) {
1037 h = init_conntrack(&tuple, l3proto, proto, skb, dataoff);
1038 if (!h)
1039 return NULL;
1040 if (IS_ERR(h))
1041 return (void *)h;
1042 }
1043 ct = nf_ct_tuplehash_to_ctrack(h);
1044
1045 /* It exists; we have (non-exclusive) reference. */
1046 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
1047 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
1048 /* Please set reply bit if this packet OK */
1049 *set_reply = 1;
1050 } else {
1051 /* Once we've had two way comms, always ESTABLISHED. */
1052 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1053 DEBUGP("nf_conntrack_in: normal packet for %p\n", ct);
1054 *ctinfo = IP_CT_ESTABLISHED;
1055 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
1056 DEBUGP("nf_conntrack_in: related packet for %p\n", ct);
1057 *ctinfo = IP_CT_RELATED;
1058 } else {
1059 DEBUGP("nf_conntrack_in: new packet for %p\n", ct);
1060 *ctinfo = IP_CT_NEW;
1061 }
1062 *set_reply = 0;
1063 }
1064 skb->nfct = &ct->ct_general;
1065 skb->nfctinfo = *ctinfo;
1066 return ct;
1067}
1068
1069unsigned int
1070nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb)
1071{
1072 struct nf_conn *ct;
1073 enum ip_conntrack_info ctinfo;
1074 struct nf_conntrack_l3proto *l3proto;
1075 struct nf_conntrack_protocol *proto;
1076 unsigned int dataoff;
1077 u_int8_t protonum;
1078 int set_reply = 0;
1079 int ret;
1080
1081 /* Previously seen (loopback or untracked)? Ignore. */
1082 if ((*pskb)->nfct) {
1083 NF_CT_STAT_INC(ignore);
1084 return NF_ACCEPT;
1085 }
1086
1087 l3proto = __nf_ct_l3proto_find((u_int16_t)pf);
1088 if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) {
1089 DEBUGP("not prepared to track yet or error occurred\n");
1090 return -ret;
1091 }
1092
1093 proto = __nf_ct_proto_find((u_int16_t)pf, protonum);
1094
1095 /* It may be a special packet, error, unclean...
1096 * inverse of the return code tells to the netfilter
1097 * core what to do with the packet. */
1098 if (proto->error != NULL &&
1099 (ret = proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) {
1100 NF_CT_STAT_INC(error);
1101 NF_CT_STAT_INC(invalid);
1102 return -ret;
1103 }
1104
1105 ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, proto,
1106 &set_reply, &ctinfo);
1107 if (!ct) {
1108 /* Not valid part of a connection */
1109 NF_CT_STAT_INC(invalid);
1110 return NF_ACCEPT;
1111 }
1112
1113 if (IS_ERR(ct)) {
1114 /* Too stressed to deal. */
1115 NF_CT_STAT_INC(drop);
1116 return NF_DROP;
1117 }
1118
1119 NF_CT_ASSERT((*pskb)->nfct);
1120
1121 ret = proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum);
1122 if (ret < 0) {
1123 /* Invalid: inverse of the return code tells
1124 * the netfilter core what to do */
1125 DEBUGP("nf_conntrack_in: Can't track with proto module\n");
1126 nf_conntrack_put((*pskb)->nfct);
1127 (*pskb)->nfct = NULL;
1128 NF_CT_STAT_INC(invalid);
1129 return -ret;
1130 }
1131
1132 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
1133 nf_conntrack_event_cache(IPCT_STATUS, *pskb);
1134
1135 return ret;
1136}
1137
1138int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1139 const struct nf_conntrack_tuple *orig)
1140{
1141 return nf_ct_invert_tuple(inverse, orig,
1142 __nf_ct_l3proto_find(orig->src.l3num),
1143 __nf_ct_proto_find(orig->src.l3num,
1144 orig->dst.protonum));
1145}
1146
1147/* Would two expected things clash? */
1148static inline int expect_clash(const struct nf_conntrack_expect *a,
1149 const struct nf_conntrack_expect *b)
1150{
1151 /* Part covered by intersection of masks must be unequal,
1152 otherwise they clash */
1153 struct nf_conntrack_tuple intersect_mask;
1154 int count;
1155
1156 intersect_mask.src.l3num = a->mask.src.l3num & b->mask.src.l3num;
1157 intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
1158 intersect_mask.dst.u.all = a->mask.dst.u.all & b->mask.dst.u.all;
1159 intersect_mask.dst.protonum = a->mask.dst.protonum
1160 & b->mask.dst.protonum;
1161
1162 for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1163 intersect_mask.src.u3.all[count] =
1164 a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
1165 }
1166
1167 for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1168 intersect_mask.dst.u3.all[count] =
1169 a->mask.dst.u3.all[count] & b->mask.dst.u3.all[count];
1170 }
1171
1172 return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
1173}
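/*
 * Illustrative example: two expectations clash when their tuples agree
 * on every field covered by the intersection of their masks.  If
 * expectation A masks only the destination port while expectation B
 * masks destination port plus source address, the intersection covers
 * just the destination port; should both expect dst port 20, a single
 * incoming packet could match either of them, so they clash.
 */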
1174
1175static inline int expect_matches(const struct nf_conntrack_expect *a,
1176 const struct nf_conntrack_expect *b)
1177{
1178 return a->master == b->master
1179 && nf_ct_tuple_equal(&a->tuple, &b->tuple)
1180 && nf_ct_tuple_equal(&a->mask, &b->mask);
1181}
1182
1183/* Generally a bad idea to call this: could have matched already. */
1184void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp)
1185{
1186 struct nf_conntrack_expect *i;
1187
1188 write_lock_bh(&nf_conntrack_lock);
1189 /* choose the oldest expectation to evict */
1190 list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1191 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
1192 nf_ct_unlink_expect(i);
1193 write_unlock_bh(&nf_conntrack_lock);
1194 nf_conntrack_expect_put(i);
1195 return;
1196 }
1197 }
1198 write_unlock_bh(&nf_conntrack_lock);
1199}
1200
1201/* We don't increase the master conntrack refcount for non-fulfilled
1202 * conntracks. During the conntrack destruction, the expectations are
1203 * always killed before the conntrack itself */
1204struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me)
1205{
1206 struct nf_conntrack_expect *new;
1207
1208 new = kmem_cache_alloc(nf_conntrack_expect_cachep, GFP_ATOMIC);
1209 if (!new) {
1210 DEBUGP("expect_related: OOM allocating expect\n");
1211 return NULL;
1212 }
1213 new->master = me;
1214 atomic_set(&new->use, 1);
1215 return new;
1216}
1217
1218void nf_conntrack_expect_put(struct nf_conntrack_expect *exp)
1219{
1220 if (atomic_dec_and_test(&exp->use))
1221 kmem_cache_free(nf_conntrack_expect_cachep, exp);
1222}
1223
1224static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp)
1225{
1226 struct nf_conn_help *master_help = nfct_help(exp->master);
1227
1228 atomic_inc(&exp->use);
1229 master_help->expecting++;
1230 list_add(&exp->list, &nf_conntrack_expect_list);
1231
1232 init_timer(&exp->timeout);
1233 exp->timeout.data = (unsigned long)exp;
1234 exp->timeout.function = expectation_timed_out;
1235 exp->timeout.expires = jiffies + master_help->helper->timeout * HZ;
1236 add_timer(&exp->timeout);
1237
1238 exp->id = ++nf_conntrack_expect_next_id;
1239 atomic_inc(&exp->use);
1240 NF_CT_STAT_INC(expect_create);
1241}
1242
1243/* Race with expectations being used means we could have none to find; OK. */
1244static void evict_oldest_expect(struct nf_conn *master)
1245{
1246 struct nf_conntrack_expect *i;
1247
1248 list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1249 if (i->master == master) {
1250 if (del_timer(&i->timeout)) {
1251 nf_ct_unlink_expect(i);
1252 nf_conntrack_expect_put(i);
1253 }
1254 break;
1255 }
1256 }
1257}
1258
1259static inline int refresh_timer(struct nf_conntrack_expect *i)
1260{
1261 struct nf_conn_help *master_help = nfct_help(i->master);
1262
1263 if (!del_timer(&i->timeout))
1264 return 0;
1265
1266 i->timeout.expires = jiffies + master_help->helper->timeout*HZ;
1267 add_timer(&i->timeout);
1268 return 1;
1269}
1270
1271int nf_conntrack_expect_related(struct nf_conntrack_expect *expect)
1272{
1273 struct nf_conntrack_expect *i;
1274 struct nf_conn *master = expect->master;
1275 struct nf_conn_help *master_help = nfct_help(master);
1276 int ret;
1277
1278 NF_CT_ASSERT(master_help);
1279
1280 DEBUGP("nf_conntrack_expect_related %p\n", expect);
1281 DEBUGP("tuple: "); NF_CT_DUMP_TUPLE(&expect->tuple);
1282 DEBUGP("mask: "); NF_CT_DUMP_TUPLE(&expect->mask);
1283
1284 write_lock_bh(&nf_conntrack_lock);
1285 list_for_each_entry(i, &nf_conntrack_expect_list, list) {
1286 if (expect_matches(i, expect)) {
1287 /* Refresh timer: if it's dying, ignore.. */
1288 if (refresh_timer(i)) {
1289 ret = 0;
1290 goto out;
1291 }
1292 } else if (expect_clash(i, expect)) {
1293 ret = -EBUSY;
1294 goto out;
1295 }
1296 }
1297 /* Will be over limit? */
1298 if (master_help->helper->max_expected &&
1299 master_help->expecting >= master_help->helper->max_expected)
1300 evict_oldest_expect(master);
1301
1302 nf_conntrack_expect_insert(expect);
1303 nf_conntrack_expect_event(IPEXP_NEW, expect);
1304 ret = 0;
1305out:
1306 write_unlock_bh(&nf_conntrack_lock);
1307 return ret;
1308}
1309
1310int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
1311{
1312 int ret;
1313 BUG_ON(me->timeout == 0);
1314
1315 ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
1316 sizeof(struct nf_conn)
1317 + sizeof(struct nf_conn_help)
1318 + __alignof__(struct nf_conn_help));
1319 if (ret < 0) {
1320 printk(KERN_ERR "nf_conntrack_helper_register: Unable to create slab cache for conntracks\n");
1321 return ret;
1322 }
1323 write_lock_bh(&nf_conntrack_lock);
1324 list_prepend(&helpers, me);
1325 write_unlock_bh(&nf_conntrack_lock);
1326
1327 return 0;
1328}
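/*
 * Registration sketch (illustrative only, using the fields referenced
 * elsewhere in this file): a helper module typically fills in a
 * struct nf_conntrack_helper - at least ->name, ->me, ->timeout,
 * ->max_expected and the ->tuple/->mask pair that helper_cmp() matches
 * against - then calls nf_conntrack_helper_register() from its init
 * routine and nf_conntrack_helper_unregister() on module exit.
 */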
1329
1330struct nf_conntrack_helper *
1331__nf_conntrack_helper_find_byname(const char *name)
1332{
1333 struct nf_conntrack_helper *h;
1334
1335 list_for_each_entry(h, &helpers, list) {
1336 if (!strcmp(h->name, name))
1337 return h;
1338 }
1339
1340 return NULL;
1341}
1342
1343static inline int unhelp(struct nf_conntrack_tuple_hash *i,
1344 const struct nf_conntrack_helper *me)
1345{
1346 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);
1347 struct nf_conn_help *help = nfct_help(ct);
1348
1349 if (help && help->helper == me) {
1350 nf_conntrack_event(IPCT_HELPER, ct);
1351 help->helper = NULL;
1352 }
1353 return 0;
1354}
1355
1356void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
1357{
1358 unsigned int i;
1359 struct nf_conntrack_expect *exp, *tmp;
1360
1361 /* Need write lock here, to delete helper. */
1362 write_lock_bh(&nf_conntrack_lock);
1363 LIST_DELETE(&helpers, me);
1364
1365 /* Get rid of expectations */
1366 list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) {
1367 struct nf_conn_help *help = nfct_help(exp->master);
1368 if (help->helper == me && del_timer(&exp->timeout)) {
1369 nf_ct_unlink_expect(exp);
1370 nf_conntrack_expect_put(exp);
1371 }
1372 }
1373
1374 /* Get rid of expecteds, set helpers to NULL. */
1375 LIST_FIND_W(&unconfirmed, unhelp, struct nf_conntrack_tuple_hash*, me);
1376 for (i = 0; i < nf_conntrack_htable_size; i++)
1377 LIST_FIND_W(&nf_conntrack_hash[i], unhelp,
1378 struct nf_conntrack_tuple_hash *, me);
1379 write_unlock_bh(&nf_conntrack_lock);
1380
1381 /* Someone could be still looking at the helper in a bh. */
1382 synchronize_net();
1383}
1384
1385/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1386void __nf_ct_refresh_acct(struct nf_conn *ct,
1387 enum ip_conntrack_info ctinfo,
1388 const struct sk_buff *skb,
1389 unsigned long extra_jiffies,
1390 int do_acct)
1391{
1392 int event = 0;
1393
1394 NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
1395 NF_CT_ASSERT(skb);
1396
1397 write_lock_bh(&nf_conntrack_lock);
1398
1399 /* If not in hash table, timer will not be active yet */
1400 if (!nf_ct_is_confirmed(ct)) {
1401 ct->timeout.expires = extra_jiffies;
1402 event = IPCT_REFRESH;
1403 } else {
1404 /* Need del_timer for race avoidance (may already be dying). */
1405 if (del_timer(&ct->timeout)) {
1406 ct->timeout.expires = jiffies + extra_jiffies;
1407 add_timer(&ct->timeout);
1408 event = IPCT_REFRESH;
1409 }
1410 }
1411
1412#ifdef CONFIG_NF_CT_ACCT
1413 if (do_acct) {
1414 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1415 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1416 skb->len - (unsigned int)(skb->nh.raw - skb->data);
1417 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1418 || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1419 event |= IPCT_COUNTER_FILLING;
1420 }
1421#endif
1422
1423 write_unlock_bh(&nf_conntrack_lock);
1424
1425 /* must be unlocked when calling event cache */
1426 if (event)
1427 nf_conntrack_event_cache(event, skb);
1428}
1429
1430#if defined(CONFIG_NF_CT_NETLINK) || \
1431 defined(CONFIG_NF_CT_NETLINK_MODULE)
1432
1433#include <linux/netfilter/nfnetlink.h>
1434#include <linux/netfilter/nfnetlink_conntrack.h>
1435#include <linux/mutex.h>
1436
1437
1438/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
1439 * in ip_conntrack_core, since we don't want the protocols to autoload
1440 * or depend on ctnetlink */
1441int nf_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1442 const struct nf_conntrack_tuple *tuple)
1443{
1444 NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1445 &tuple->src.u.tcp.port);
1446 NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1447 &tuple->dst.u.tcp.port);
1448 return 0;
1449
1450nfattr_failure:
1451 return -1;
1452}
1453
1454static const size_t cta_min_proto[CTA_PROTO_MAX] = {
1455 [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t),
1456 [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t)
1457};
1458
1459int nf_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1460 struct nf_conntrack_tuple *t)
1461{
1462 if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1463 return -EINVAL;
1464
1465 if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
1466 return -EINVAL;
1467
1468 t->src.u.tcp.port =
1469 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1470 t->dst.u.tcp.port =
1471 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1472
1473 return 0;
1474}
1475#endif
1476
1477/* Used by ipt_REJECT and ip6t_REJECT. */
1478void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1479{
1480 struct nf_conn *ct;
1481 enum ip_conntrack_info ctinfo;
1482
1483 /* This ICMP is in reverse direction to the packet which caused it */
1484 ct = nf_ct_get(skb, &ctinfo);
1485 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1486 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1487 else
1488 ctinfo = IP_CT_RELATED;
1489
1490 /* Attach to new skbuff, and increment count */
1491 nskb->nfct = &ct->ct_general;
1492 nskb->nfctinfo = ctinfo;
1493 nf_conntrack_get(nskb->nfct);
1494}
1495
1496static inline int
1497do_iter(const struct nf_conntrack_tuple_hash *i,
1498 int (*iter)(struct nf_conn *i, void *data),
1499 void *data)
1500{
1501 return iter(nf_ct_tuplehash_to_ctrack(i), data);
1502}
1503
1504/* Bring out ya dead! */
1505static struct nf_conntrack_tuple_hash *
1506get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
1507 void *data, unsigned int *bucket)
1508{
1509 struct nf_conntrack_tuple_hash *h = NULL;
1510
1511 write_lock_bh(&nf_conntrack_lock);
1512 for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
1513 h = LIST_FIND_W(&nf_conntrack_hash[*bucket], do_iter,
1514 struct nf_conntrack_tuple_hash *, iter, data);
1515 if (h)
1516 break;
1517 }
1518 if (!h)
1519 h = LIST_FIND_W(&unconfirmed, do_iter,
1520 struct nf_conntrack_tuple_hash *, iter, data);
1521 if (h)
1522 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
1523 write_unlock_bh(&nf_conntrack_lock);
1524
1525 return h;
1526}
1527
1528void
1529nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data)
1530{
1531 struct nf_conntrack_tuple_hash *h;
1532 unsigned int bucket = 0;
1533
1534 while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1535 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
1536 /* Time to push up daises... */
1537 if (del_timer(&ct->timeout))
1538 death_by_timeout((unsigned long)ct);
1539 /* ... else the timer will get him soon. */
1540
1541 nf_ct_put(ct);
1542 }
1543}
1544
1545static int kill_all(struct nf_conn *i, void *data)
1546{
1547 return 1;
1548}
1549
1550static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
1551{
1552 if (vmalloced)
1553 vfree(hash);
1554 else
1555 free_pages((unsigned long)hash,
1556 get_order(sizeof(struct list_head) * size));
1557}
1558
1559void nf_conntrack_flush()
1560{
1561 nf_ct_iterate_cleanup(kill_all, NULL);
1562}
1563
1564/* Mishearing the voices in his head, our hero wonders how he's
1565 supposed to kill the mall. */
1566void nf_conntrack_cleanup(void)
1567{
1568 int i;
1569
1570 ip_ct_attach = NULL;
1571
1572 /* This makes sure all current packets have passed through
1573 netfilter framework. Roll on, two-stage module
1574 delete... */
1575 synchronize_net();
1576
1577 nf_ct_event_cache_flush();
1578 i_see_dead_people:
1579 nf_conntrack_flush();
1580 if (atomic_read(&nf_conntrack_count) != 0) {
1581 schedule();
1582 goto i_see_dead_people;
1583 }
1584 /* wait until all references to nf_conntrack_untracked are dropped */
1585 while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
1586 schedule();
1587
1588 for (i = 0; i < NF_CT_F_NUM; i++) {
1589 if (nf_ct_cache[i].use == 0)
1590 continue;
1591
1592 NF_CT_ASSERT(nf_ct_cache[i].use == 1);
1593 nf_ct_cache[i].use = 1;
1594 nf_conntrack_unregister_cache(i);
1595 }
1596 kmem_cache_destroy(nf_conntrack_expect_cachep);
1597 free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1598 nf_conntrack_htable_size);
1599
1600 /* free l3proto protocol tables */
1601 for (i = 0; i < PF_MAX; i++)
1602 if (nf_ct_protos[i]) {
1603 kfree(nf_ct_protos[i]);
1604 nf_ct_protos[i] = NULL;
1605 }
1606}
1607
1608static struct list_head *alloc_hashtable(int size, int *vmalloced)
1609{
1610 struct list_head *hash;
1611 unsigned int i;
1612
1613 *vmalloced = 0;
1614 hash = (void*)__get_free_pages(GFP_KERNEL,
1615 get_order(sizeof(struct list_head)
1616 * size));
1617 if (!hash) {
1618 *vmalloced = 1;
1619 printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
1620 hash = vmalloc(sizeof(struct list_head) * size);
1621 }
1622
1623 if (hash)
1624 for (i = 0; i < size; i++)
1625 INIT_LIST_HEAD(&hash[i]);
1626
1627 return hash;
1628}
1629
1630int set_hashsize(const char *val, struct kernel_param *kp)
1631{
1632 int i, bucket, hashsize, vmalloced;
1633 int old_vmalloced, old_size;
1634 int rnd;
1635 struct list_head *hash, *old_hash;
1636 struct nf_conntrack_tuple_hash *h;
1637
1638 /* On boot, we can set this without any fancy locking. */
1639 if (!nf_conntrack_htable_size)
1640 return param_set_uint(val, kp);
1641
1642 hashsize = simple_strtol(val, NULL, 0);
1643 if (!hashsize)
1644 return -EINVAL;
1645
1646 hash = alloc_hashtable(hashsize, &vmalloced);
1647 if (!hash)
1648 return -ENOMEM;
1649
1650 /* We have to rehash for the new table anyway, so we also can
1651 * use a new random seed */
1652 get_random_bytes(&rnd, 4);
1653
1654 write_lock_bh(&nf_conntrack_lock);
1655 for (i = 0; i < nf_conntrack_htable_size; i++) {
1656 while (!list_empty(&nf_conntrack_hash[i])) {
1657 h = list_entry(nf_conntrack_hash[i].next,
1658 struct nf_conntrack_tuple_hash, list);
1659 list_del(&h->list);
1660 bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1661 list_add_tail(&h->list, &hash[bucket]);
1662 }
1663 }
1664 old_size = nf_conntrack_htable_size;
1665 old_vmalloced = nf_conntrack_vmalloc;
1666 old_hash = nf_conntrack_hash;
1667
1668 nf_conntrack_htable_size = hashsize;
1669 nf_conntrack_vmalloc = vmalloced;
1670 nf_conntrack_hash = hash;
1671 nf_conntrack_hash_rnd = rnd;
1672 write_unlock_bh(&nf_conntrack_lock);
1673
1674 free_conntrack_hash(old_hash, old_vmalloced, old_size);
1675 return 0;
1676}
1677
1678module_param_call(hashsize, set_hashsize, param_get_uint,
1679 &nf_conntrack_htable_size, 0600);
1680
1681int __init nf_conntrack_init(void)
1682{
1683 unsigned int i;
1684 int ret;
1685
1686 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1687 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
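	/*
	 * Worked example of the arithmetic below (illustrative): with
	 * 4 KB pages and 8-byte list heads on a 32 MB i386 box,
	 * (32 MB / 16384) / 8 = 256 buckets, matching the comment above;
	 * nf_conntrack_max then defaults to 8 * 256 = 2048 entries.
	 */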
1688 if (!nf_conntrack_htable_size) {
1689 nf_conntrack_htable_size
1690 = (((num_physpages << PAGE_SHIFT) / 16384)
1691 / sizeof(struct list_head));
1692 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1693 nf_conntrack_htable_size = 8192;
1694 if (nf_conntrack_htable_size < 16)
1695 nf_conntrack_htable_size = 16;
1696 }
1697 nf_conntrack_max = 8 * nf_conntrack_htable_size;
1698
1699 printk("nf_conntrack version %s (%u buckets, %d max)\n",
1700 NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
1701 nf_conntrack_max);
1702
1703 nf_conntrack_hash = alloc_hashtable(nf_conntrack_htable_size,
1704 &nf_conntrack_vmalloc);
1705 if (!nf_conntrack_hash) {
1706 printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
1707 goto err_out;
1708 }
1709
1710 ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic",
1711 sizeof(struct nf_conn));
1712 if (ret < 0) {
1713 printk(KERN_ERR "Unable to create nf_conn slab cache\n");
1714 goto err_free_hash;
1715 }
1716
1717 nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect",
1718 sizeof(struct nf_conntrack_expect),
1719 0, 0, NULL, NULL);
1720 if (!nf_conntrack_expect_cachep) {
1721 printk(KERN_ERR "Unable to create nf_expect slab cache\n");
1722 goto err_free_conntrack_slab;
1723 }
1724
1725 /* Don't NEED lock here, but good form anyway. */
1726 write_lock_bh(&nf_conntrack_lock);
1727 for (i = 0; i < PF_MAX; i++)
1728 nf_ct_l3protos[i] = &nf_conntrack_generic_l3proto;
1729 write_unlock_bh(&nf_conntrack_lock);
1730
1731 /* For use by REJECT target */
1732 ip_ct_attach = __nf_conntrack_attach;
1733
1734 /* Set up fake conntrack:
1735 - to never be deleted, not in any hashes */
1736 atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
1737 /* - and make it look like a confirmed connection */
1738 set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
1739
1740 return ret;
1741
1742err_free_conntrack_slab:
1743 nf_conntrack_unregister_cache(NF_CT_F_BASIC);
1744err_free_hash:
1745 free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1746 nf_conntrack_htable_size);
1747err_out:
1748 return -ENOMEM;
1749}