net/netfilter/nf_conntrack_core.c
9fb9cbb1
YK
1/* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
4
5/* (C) 1999-2001 Paul `Rusty' Russell
dc808fe2 6 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
9fb9cbb1
YK
7 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
14 * - new API and handling of conntrack/nat helpers
15 * - now capable of multiple expectations for one master
16 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
17 * - add usage/reference counts to ip_conntrack_expect
18 * - export ip_conntrack[_expect]_{find_get,put} functions
19 * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
20 * - generalize L3 protocol dependent part.
21 * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
22 * - add support for various sizes of conntrack structures.
dc808fe2
HW
23 * 26 Jan 2006: Harald Welte <laforge@netfilter.org>
24 * - restructure nf_conn (introduce nf_conn_help)
25 * - redesign 'features' how they were originally intended
b9f78f9f
PNA
26 * 26 Feb 2006: Pablo Neira Ayuso <pablo@eurodev.net>
27 * - add support for L3 protocol module load on demand.
9fb9cbb1
YK
28 *
29 * Derived from net/ipv4/netfilter/ip_conntrack_core.c
30 */
31
32#include <linux/config.h>
33#include <linux/types.h>
34#include <linux/netfilter.h>
35#include <linux/module.h>
36#include <linux/skbuff.h>
37#include <linux/proc_fs.h>
38#include <linux/vmalloc.h>
39#include <linux/stddef.h>
40#include <linux/slab.h>
41#include <linux/random.h>
42#include <linux/jhash.h>
43#include <linux/err.h>
44#include <linux/percpu.h>
45#include <linux/moduleparam.h>
46#include <linux/notifier.h>
47#include <linux/kernel.h>
48#include <linux/netdevice.h>
49#include <linux/socket.h>
50
51/* This rwlock protects the main hash table, protocol/helper/expected
52 registrations, conntrack timers */
53#define ASSERT_READ_LOCK(x)
54#define ASSERT_WRITE_LOCK(x)
55
56#include <net/netfilter/nf_conntrack.h>
57#include <net/netfilter/nf_conntrack_l3proto.h>
58#include <net/netfilter/nf_conntrack_protocol.h>
59#include <net/netfilter/nf_conntrack_helper.h>
60#include <net/netfilter/nf_conntrack_core.h>
61#include <linux/netfilter_ipv4/listhelp.h>
62
dc808fe2 63#define NF_CONNTRACK_VERSION "0.5.0"
9fb9cbb1
YK
64
65#if 0
66#define DEBUGP printk
67#else
68#define DEBUGP(format, args...)
69#endif
70
71DEFINE_RWLOCK(nf_conntrack_lock);
72
73/* nf_conntrack_standalone needs this */
74atomic_t nf_conntrack_count = ATOMIC_INIT(0);
75
76void (*nf_conntrack_destroyed)(struct nf_conn *conntrack) = NULL;
77LIST_HEAD(nf_conntrack_expect_list);
78struct nf_conntrack_protocol **nf_ct_protos[PF_MAX];
79struct nf_conntrack_l3proto *nf_ct_l3protos[PF_MAX];
80static LIST_HEAD(helpers);
81unsigned int nf_conntrack_htable_size = 0;
82int nf_conntrack_max;
83struct list_head *nf_conntrack_hash;
84static kmem_cache_t *nf_conntrack_expect_cachep;
85struct nf_conn nf_conntrack_untracked;
86unsigned int nf_ct_log_invalid;
87static LIST_HEAD(unconfirmed);
88static int nf_conntrack_vmalloc;
89
4e3882f7
PNA
90static unsigned int nf_conntrack_next_id;
91static unsigned int nf_conntrack_expect_next_id;
9fb9cbb1 92#ifdef CONFIG_NF_CONNTRACK_EVENTS
e041c683
AS
93ATOMIC_NOTIFIER_HEAD(nf_conntrack_chain);
94ATOMIC_NOTIFIER_HEAD(nf_conntrack_expect_chain);
9fb9cbb1
YK
95
96DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
97
98/* deliver cached events and clear cache entry - must be called with locally
99 * disabled softirqs */
100static inline void
101__nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
102{
103 DEBUGP("ecache: delivering events for %p\n", ecache->ct);
104 if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
105 && ecache->events)
e041c683 106 atomic_notifier_call_chain(&nf_conntrack_chain, ecache->events,
9fb9cbb1
YK
107 ecache->ct);
108
109 ecache->events = 0;
110 nf_ct_put(ecache->ct);
111 ecache->ct = NULL;
112}
113
114/* Deliver all cached events for a particular conntrack. This is called
115 * by code prior to async packet handling for freeing the skb */
116void nf_ct_deliver_cached_events(const struct nf_conn *ct)
117{
118 struct nf_conntrack_ecache *ecache;
119
120 local_bh_disable();
121 ecache = &__get_cpu_var(nf_conntrack_ecache);
122 if (ecache->ct == ct)
123 __nf_ct_deliver_cached_events(ecache);
124 local_bh_enable();
125}
126
127/* Deliver cached events for old pending events, if current conntrack != old */
128void __nf_ct_event_cache_init(struct nf_conn *ct)
129{
130 struct nf_conntrack_ecache *ecache;
131
132 /* take care of delivering potentially old events */
133 ecache = &__get_cpu_var(nf_conntrack_ecache);
134 BUG_ON(ecache->ct == ct);
135 if (ecache->ct)
136 __nf_ct_deliver_cached_events(ecache);
137 /* initialize for this conntrack/packet */
138 ecache->ct = ct;
139 nf_conntrack_get(&ct->ct_general);
140}
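/*
 * A sketch of how this cache is meant to be used (inferred from this file,
 * not a formal contract): code handling a packet calls
 * nf_conntrack_event_cache(IPCT_..., skb) to record events against the
 * skb's conntrack; the accumulated events are then delivered in a single
 * notifier call, either explicitly via nf_ct_deliver_cached_events() or
 * when __nf_ct_event_cache_init() reuses the per-CPU slot for a
 * different conntrack.
 */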
141
142/* flush the event cache - touches other CPU's data and must not be called
143 * while packets are still passing through the code */
144static void nf_ct_event_cache_flush(void)
145{
146 struct nf_conntrack_ecache *ecache;
147 int cpu;
148
149 for_each_cpu(cpu) {
150 ecache = &per_cpu(nf_conntrack_ecache, cpu);
151 if (ecache->ct)
152 nf_ct_put(ecache->ct);
153 }
154}
155#else
156static inline void nf_ct_event_cache_flush(void) {}
157#endif /* CONFIG_NF_CONNTRACK_EVENTS */
158
159DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
160EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat);
161
162/*
163 * This scheme offers various sizes of "struct nf_conn" depending on
164 * features (helper, nat, ...)
165 */
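/*
 * For example, nf_conntrack_helper_register() later in this file registers
 * the cache for helper-enabled conntracks roughly like this (sketch of the
 * call made below, shown here only as illustration):
 *
 *	nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
 *				    sizeof(struct nf_conn)
 *				    + sizeof(struct nf_conn_help)
 *				    + __alignof__(struct nf_conn_help));
 */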
166
167#define NF_CT_FEATURES_NAMELEN 256
168static struct {
169 /* name of slab cache. printed in /proc/slabinfo */
170 char *name;
171
172 /* size of slab cache */
173 size_t size;
174
175 /* slab cache pointer */
176 kmem_cache_t *cachep;
177
178 /* allocated slab cache + modules which use this slab cache */
179 int use;
180
181 /* Initialization */
182 int (*init_conntrack)(struct nf_conn *, u_int32_t);
183
184} nf_ct_cache[NF_CT_F_NUM];
185
186/* protect members of nf_ct_cache except "use" */
187DEFINE_RWLOCK(nf_ct_cache_lock);
188
189/* This avoids calling kmem_cache_create() with same name simultaneously */
57b47a53 190static DEFINE_MUTEX(nf_ct_cache_mutex);
9fb9cbb1
YK
191
192extern struct nf_conntrack_protocol nf_conntrack_generic_protocol;
193struct nf_conntrack_protocol *
c1d10adb 194__nf_ct_proto_find(u_int16_t l3proto, u_int8_t protocol)
9fb9cbb1 195{
ddc8d029 196 if (unlikely(l3proto >= AF_MAX || nf_ct_protos[l3proto] == NULL))
9fb9cbb1
YK
197 return &nf_conntrack_generic_protocol;
198
199 return nf_ct_protos[l3proto][protocol];
200}
201
c1d10adb
PNA
202/* this is guaranteed to always return a valid protocol helper, since
203 * it falls back to generic_protocol */
204struct nf_conntrack_protocol *
205nf_ct_proto_find_get(u_int16_t l3proto, u_int8_t protocol)
206{
207 struct nf_conntrack_protocol *p;
208
209 preempt_disable();
210 p = __nf_ct_proto_find(l3proto, protocol);
211 if (p) {
212 if (!try_module_get(p->me))
213 p = &nf_conntrack_generic_protocol;
214 }
215 preempt_enable();
216
217 return p;
218}
219
220void nf_ct_proto_put(struct nf_conntrack_protocol *p)
221{
222 module_put(p->me);
223}
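/*
 * Typical calling pattern (illustration only, not taken from a real caller):
 *
 *	struct nf_conntrack_protocol *proto;
 *
 *	proto = nf_ct_proto_find_get(PF_INET, IPPROTO_TCP);
 *	... use proto ...
 *	nf_ct_proto_put(proto);
 *
 * The _find_get/_put pair keeps the protocol module pinned while in use.
 */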
224
225struct nf_conntrack_l3proto *
226nf_ct_l3proto_find_get(u_int16_t l3proto)
227{
228 struct nf_conntrack_l3proto *p;
229
230 preempt_disable();
231 p = __nf_ct_l3proto_find(l3proto);
232 if (p) {
233 if (!try_module_get(p->me))
234 p = &nf_conntrack_generic_l3proto;
235 }
236 preempt_enable();
237
238 return p;
239}
240
241void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p)
242{
243 module_put(p->me);
244}
245
b9f78f9f
PNA
246int
247nf_ct_l3proto_try_module_get(unsigned short l3proto)
248{
249 int ret;
250 struct nf_conntrack_l3proto *p;
251
252retry: p = nf_ct_l3proto_find_get(l3proto);
253 if (p == &nf_conntrack_generic_l3proto) {
254 ret = request_module("nf_conntrack-%d", l3proto);
255 if (!ret)
256 goto retry;
257
258 return -EPROTOTYPE;
259 }
260
261 return 0;
262}
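/*
 * Note on the request_module() above: it asks for a module aliased
 * "nf_conntrack-<pf>", e.g. "nf_conntrack-2" for PF_INET. The per-family
 * conntrack modules are expected to declare a matching MODULE_ALIAS
 * (an assumption about the l3proto modules, not something enforced here).
 */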
263
264void nf_ct_l3proto_module_put(unsigned short l3proto)
265{
266 struct nf_conntrack_l3proto *p;
267
268 preempt_disable();
269 p = __nf_ct_l3proto_find(l3proto);
270 preempt_enable();
271
272 module_put(p->me);
273}
274
9fb9cbb1
YK
275static int nf_conntrack_hash_rnd_initted;
276static unsigned int nf_conntrack_hash_rnd;
277
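/*
 * The hash below is salted with a random value (nf_conntrack_hash_rnd,
 * initialised lazily in __nf_conntrack_alloc()) so that remote peers
 * cannot predict bucket placement and deliberately flood a single chain.
 */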
278static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
279 unsigned int size, unsigned int rnd)
280{
281 unsigned int a, b;
282 a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all),
283 ((tuple->src.l3num) << 16) | tuple->dst.protonum);
284 b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all),
285 (tuple->src.u.all << 16) | tuple->dst.u.all);
286
287 return jhash_2words(a, b, rnd) % size;
288}
289
290static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
291{
292 return __hash_conntrack(tuple, nf_conntrack_htable_size,
293 nf_conntrack_hash_rnd);
294}
295
9fb9cbb1 296int nf_conntrack_register_cache(u_int32_t features, const char *name,
dc808fe2 297 size_t size)
9fb9cbb1
YK
298{
299 int ret = 0;
300 char *cache_name;
301 kmem_cache_t *cachep;
302
303 DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n",
304 features, name, size);
305
306 if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) {
307 DEBUGP("nf_conntrack_register_cache: invalid features.: 0x%x\n",
308 features);
309 return -EINVAL;
310 }
311
57b47a53 312 mutex_lock(&nf_ct_cache_mutex);
9fb9cbb1
YK
313
314 write_lock_bh(&nf_ct_cache_lock);
315 /* e.g: multiple helpers are loaded */
316 if (nf_ct_cache[features].use > 0) {
317 DEBUGP("nf_conntrack_register_cache: already registered.\n");
318 if ((!strncmp(nf_ct_cache[features].name, name,
319 NF_CT_FEATURES_NAMELEN))
dc808fe2 320 && nf_ct_cache[features].size == size) {
9fb9cbb1
YK
321 DEBUGP("nf_conntrack_register_cache: reusing.\n");
322 nf_ct_cache[features].use++;
323 ret = 0;
324 } else
325 ret = -EBUSY;
326
327 write_unlock_bh(&nf_ct_cache_lock);
57b47a53 328 mutex_unlock(&nf_ct_cache_mutex);
9fb9cbb1
YK
329 return ret;
330 }
331 write_unlock_bh(&nf_ct_cache_lock);
332
333 /*
334 * The memory holding the slab cache name must remain valid until
335 * the cache is destroyed.
336 */
337 cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC);
338 if (cache_name == NULL) {
339 DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n");
340 ret = -ENOMEM;
341 goto out_up_mutex;
342 }
343
344 if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN)
345 >= NF_CT_FEATURES_NAMELEN) {
346 printk("nf_conntrack_register_cache: name too long\n");
347 ret = -EINVAL;
348 goto out_free_name;
349 }
350
351 cachep = kmem_cache_create(cache_name, size, 0, 0,
352 NULL, NULL);
353 if (!cachep) {
354 printk("nf_conntrack_register_cache: Can't create slab cache "
355 "for the features = 0x%x\n", features);
356 ret = -ENOMEM;
357 goto out_free_name;
358 }
359
360 write_lock_bh(&nf_ct_cache_lock);
361 nf_ct_cache[features].use = 1;
362 nf_ct_cache[features].size = size;
9fb9cbb1
YK
363 nf_ct_cache[features].cachep = cachep;
364 nf_ct_cache[features].name = cache_name;
365 write_unlock_bh(&nf_ct_cache_lock);
366
367 goto out_up_mutex;
368
369out_free_name:
370 kfree(cache_name);
371out_up_mutex:
57b47a53 372 mutex_unlock(&nf_ct_cache_mutex);
9fb9cbb1
YK
373 return ret;
374}
375
376/* FIXME: Currently, only nf_conntrack_cleanup() can call this function. */
377void nf_conntrack_unregister_cache(u_int32_t features)
378{
379 kmem_cache_t *cachep;
380 char *name;
381
382 /*
383 * This ensures that kmem_cache_create() isn't called before the
384 * slab cache is destroyed.
385 */
386 DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features);
57b47a53 387 mutex_lock(&nf_ct_cache_mutex);
9fb9cbb1
YK
388
389 write_lock_bh(&nf_ct_cache_lock);
390 if (--nf_ct_cache[features].use > 0) {
391 write_unlock_bh(&nf_ct_cache_lock);
57b47a53 392 mutex_unlock(&nf_ct_cache_mutex);
9fb9cbb1
YK
393 return;
394 }
395 cachep = nf_ct_cache[features].cachep;
396 name = nf_ct_cache[features].name;
397 nf_ct_cache[features].cachep = NULL;
398 nf_ct_cache[features].name = NULL;
9fb9cbb1
YK
399 nf_ct_cache[features].size = 0;
400 write_unlock_bh(&nf_ct_cache_lock);
401
402 synchronize_net();
403
404 kmem_cache_destroy(cachep);
405 kfree(name);
406
57b47a53 407 mutex_unlock(&nf_ct_cache_mutex);
9fb9cbb1
YK
408}
409
410int
411nf_ct_get_tuple(const struct sk_buff *skb,
412 unsigned int nhoff,
413 unsigned int dataoff,
414 u_int16_t l3num,
415 u_int8_t protonum,
416 struct nf_conntrack_tuple *tuple,
417 const struct nf_conntrack_l3proto *l3proto,
418 const struct nf_conntrack_protocol *protocol)
419{
420 NF_CT_TUPLE_U_BLANK(tuple);
421
422 tuple->src.l3num = l3num;
423 if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
424 return 0;
425
426 tuple->dst.protonum = protonum;
427 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
428
429 return protocol->pkt_to_tuple(skb, dataoff, tuple);
430}
431
432int
433nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
434 const struct nf_conntrack_tuple *orig,
435 const struct nf_conntrack_l3proto *l3proto,
436 const struct nf_conntrack_protocol *protocol)
437{
438 NF_CT_TUPLE_U_BLANK(inverse);
439
440 inverse->src.l3num = orig->src.l3num;
441 if (l3proto->invert_tuple(inverse, orig) == 0)
442 return 0;
443
444 inverse->dst.dir = !orig->dst.dir;
445
446 inverse->dst.protonum = orig->dst.protonum;
447 return protocol->invert_tuple(inverse, orig);
448}
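/*
 * For illustration: inverting the TCP/IPv4 tuple
 * src=192.0.2.1:1025 -> dst=198.51.100.7:80 yields
 * src=198.51.100.7:80 -> dst=192.0.2.1:1025, with dst.dir flipped to the
 * reply direction; the address and port swaps themselves are done by the
 * l3proto and protocol invert_tuple() callbacks.
 */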
449
450/* nf_conntrack_expect helper functions */
c1d10adb 451void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
9fb9cbb1 452{
dc808fe2
HW
453 struct nf_conn_help *master_help = nfct_help(exp->master);
454
455 NF_CT_ASSERT(master_help);
9fb9cbb1 456 ASSERT_WRITE_LOCK(&nf_conntrack_lock);
4a59a810 457 NF_CT_ASSERT(!timer_pending(&exp->timeout));
dc808fe2 458
9fb9cbb1
YK
459 list_del(&exp->list);
460 NF_CT_STAT_INC(expect_delete);
dc808fe2 461 master_help->expecting--;
9fb9cbb1
YK
462 nf_conntrack_expect_put(exp);
463}
464
465static void expectation_timed_out(unsigned long ul_expect)
466{
467 struct nf_conntrack_expect *exp = (void *)ul_expect;
468
469 write_lock_bh(&nf_conntrack_lock);
470 nf_ct_unlink_expect(exp);
471 write_unlock_bh(&nf_conntrack_lock);
472 nf_conntrack_expect_put(exp);
473}
474
c1d10adb
PNA
475struct nf_conntrack_expect *
476__nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
477{
478 struct nf_conntrack_expect *i;
479
480 list_for_each_entry(i, &nf_conntrack_expect_list, list) {
481 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
482 atomic_inc(&i->use);
483 return i;
484 }
485 }
486 return NULL;
487}
488
489/* Just find an expectation corresponding to a tuple. */
490struct nf_conntrack_expect *
491nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
492{
493 struct nf_conntrack_expect *i;
494
495 read_lock_bh(&nf_conntrack_lock);
496 i = __nf_conntrack_expect_find(tuple);
497 read_unlock_bh(&nf_conntrack_lock);
498
499 return i;
500}
501
9fb9cbb1
YK
502/* If an expectation for this connection is found, it is deleted from
503 * the global list and then returned. */
504static struct nf_conntrack_expect *
505find_expectation(const struct nf_conntrack_tuple *tuple)
506{
507 struct nf_conntrack_expect *i;
508
509 list_for_each_entry(i, &nf_conntrack_expect_list, list) {
510 /* If master is not in hash table yet (ie. packet hasn't left
511 this machine yet), how can other end know about expected?
512 Hence these are not the droids you are looking for (if
513 master ct never got confirmed, we'd hold a reference to it
514 and weird things would happen to future packets). */
515 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
516 && nf_ct_is_confirmed(i->master)) {
517 if (i->flags & NF_CT_EXPECT_PERMANENT) {
518 atomic_inc(&i->use);
519 return i;
520 } else if (del_timer(&i->timeout)) {
521 nf_ct_unlink_expect(i);
522 return i;
523 }
524 }
525 }
526 return NULL;
527}
528
529/* delete all expectations for this conntrack */
c1d10adb 530void nf_ct_remove_expectations(struct nf_conn *ct)
9fb9cbb1
YK
531{
532 struct nf_conntrack_expect *i, *tmp;
dc808fe2 533 struct nf_conn_help *help = nfct_help(ct);
9fb9cbb1
YK
534
535 /* Optimization: most connections never expect any others. */
dc808fe2 536 if (!help || help->expecting == 0)
9fb9cbb1
YK
537 return;
538
539 list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) {
540 if (i->master == ct && del_timer(&i->timeout)) {
541 nf_ct_unlink_expect(i);
542 nf_conntrack_expect_put(i);
543 }
544 }
545}
546
547static void
548clean_from_lists(struct nf_conn *ct)
549{
550 unsigned int ho, hr;
551
552 DEBUGP("clean_from_lists(%p)\n", ct);
553 ASSERT_WRITE_LOCK(&nf_conntrack_lock);
554
555 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
556 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
557 LIST_DELETE(&nf_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
558 LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
559
560 /* Destroy all pending expectations */
c1d10adb 561 nf_ct_remove_expectations(ct);
9fb9cbb1
YK
562}
563
564static void
565destroy_conntrack(struct nf_conntrack *nfct)
566{
567 struct nf_conn *ct = (struct nf_conn *)nfct;
568 struct nf_conntrack_l3proto *l3proto;
569 struct nf_conntrack_protocol *proto;
570
571 DEBUGP("destroy_conntrack(%p)\n", ct);
572 NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
573 NF_CT_ASSERT(!timer_pending(&ct->timeout));
574
575 nf_conntrack_event(IPCT_DESTROY, ct);
576 set_bit(IPS_DYING_BIT, &ct->status);
577
578 /* To make sure we don't get any weird locking issues here:
579 * destroy_conntrack() MUST NOT be called with a write lock
580 * to nf_conntrack_lock!!! -HW */
c1d10adb 581 l3proto = __nf_ct_l3proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num);
9fb9cbb1
YK
582 if (l3proto && l3proto->destroy)
583 l3proto->destroy(ct);
584
c1d10adb 585 proto = __nf_ct_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num, ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
9fb9cbb1
YK
586 if (proto && proto->destroy)
587 proto->destroy(ct);
588
589 if (nf_conntrack_destroyed)
590 nf_conntrack_destroyed(ct);
591
592 write_lock_bh(&nf_conntrack_lock);
593 /* Expectations will have been removed in clean_from_lists,
594 * except TFTP can create an expectation on the first packet,
595 * before connection is in the list, so we need to clean here,
596 * too. */
c1d10adb 597 nf_ct_remove_expectations(ct);
9fb9cbb1
YK
598
599 /* We overload first tuple to link into unconfirmed list. */
600 if (!nf_ct_is_confirmed(ct)) {
601 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
602 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
603 }
604
605 NF_CT_STAT_INC(delete);
606 write_unlock_bh(&nf_conntrack_lock);
607
608 if (ct->master)
609 nf_ct_put(ct->master);
610
611 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
612 nf_conntrack_free(ct);
613}
614
615static void death_by_timeout(unsigned long ul_conntrack)
616{
617 struct nf_conn *ct = (void *)ul_conntrack;
618
619 write_lock_bh(&nf_conntrack_lock);
620 /* Inside lock so preempt is disabled on module removal path.
621 * Otherwise we can get spurious warnings. */
622 NF_CT_STAT_INC(delete_list);
623 clean_from_lists(ct);
624 write_unlock_bh(&nf_conntrack_lock);
625 nf_ct_put(ct);
626}
627
628static inline int
629conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i,
630 const struct nf_conntrack_tuple *tuple,
631 const struct nf_conn *ignored_conntrack)
632{
633 ASSERT_READ_LOCK(&nf_conntrack_lock);
634 return nf_ct_tuplehash_to_ctrack(i) != ignored_conntrack
635 && nf_ct_tuple_equal(tuple, &i->tuple);
636}
637
c1d10adb 638struct nf_conntrack_tuple_hash *
9fb9cbb1
YK
639__nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
640 const struct nf_conn *ignored_conntrack)
641{
642 struct nf_conntrack_tuple_hash *h;
643 unsigned int hash = hash_conntrack(tuple);
644
645 ASSERT_READ_LOCK(&nf_conntrack_lock);
646 list_for_each_entry(h, &nf_conntrack_hash[hash], list) {
647 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
648 NF_CT_STAT_INC(found);
649 return h;
650 }
651 NF_CT_STAT_INC(searched);
652 }
653
654 return NULL;
655}
656
657/* Find a connection corresponding to a tuple. */
658struct nf_conntrack_tuple_hash *
659nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple,
660 const struct nf_conn *ignored_conntrack)
661{
662 struct nf_conntrack_tuple_hash *h;
663
664 read_lock_bh(&nf_conntrack_lock);
665 h = __nf_conntrack_find(tuple, ignored_conntrack);
666 if (h)
667 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
668 read_unlock_bh(&nf_conntrack_lock);
669
670 return h;
671}
672
c1d10adb
PNA
673static void __nf_conntrack_hash_insert(struct nf_conn *ct,
674 unsigned int hash,
675 unsigned int repl_hash)
676{
677 ct->id = ++nf_conntrack_next_id;
678 list_prepend(&nf_conntrack_hash[hash],
679 &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
680 list_prepend(&nf_conntrack_hash[repl_hash],
681 &ct->tuplehash[IP_CT_DIR_REPLY].list);
682}
683
684void nf_conntrack_hash_insert(struct nf_conn *ct)
685{
686 unsigned int hash, repl_hash;
687
688 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
689 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
690
691 write_lock_bh(&nf_conntrack_lock);
692 __nf_conntrack_hash_insert(ct, hash, repl_hash);
693 write_unlock_bh(&nf_conntrack_lock);
694}
695
9fb9cbb1
YK
696/* Confirm a connection given skb; places it in hash table */
697int
698__nf_conntrack_confirm(struct sk_buff **pskb)
699{
700 unsigned int hash, repl_hash;
701 struct nf_conn *ct;
702 enum ip_conntrack_info ctinfo;
703
704 ct = nf_ct_get(*pskb, &ctinfo);
705
706 /* ipt_REJECT uses nf_conntrack_attach to attach related
707 ICMP/TCP RST packets in the other direction. The actual packet
708 which created the connection will be IP_CT_NEW or, for an
709 expected connection, IP_CT_RELATED. */
710 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
711 return NF_ACCEPT;
712
713 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
714 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
715
716 /* We're not in hash table, and we refuse to set up related
717 connections for unconfirmed conns. But packet copies and
718 REJECT will give spurious warnings here. */
719 /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
720
721 /* No external references means no one else could have
722 confirmed us. */
723 NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
724 DEBUGP("Confirming conntrack %p\n", ct);
725
726 write_lock_bh(&nf_conntrack_lock);
727
728 /* See if there's one in the list already, including reverse:
729 NAT could have grabbed it without realizing, since we're
730 not in the hash. If there is, we lost the race. */
731 if (!LIST_FIND(&nf_conntrack_hash[hash],
732 conntrack_tuple_cmp,
733 struct nf_conntrack_tuple_hash *,
734 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
735 && !LIST_FIND(&nf_conntrack_hash[repl_hash],
736 conntrack_tuple_cmp,
737 struct nf_conntrack_tuple_hash *,
738 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
dc808fe2 739 struct nf_conn_help *help;
9fb9cbb1
YK
740 /* Remove from unconfirmed list */
741 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
742
c1d10adb 743 __nf_conntrack_hash_insert(ct, hash, repl_hash);
9fb9cbb1
YK
744 /* Timer relative to confirmation time, not original
745 setting time, otherwise we'd get timer wrap in
746 weird delay cases. */
747 ct->timeout.expires += jiffies;
748 add_timer(&ct->timeout);
749 atomic_inc(&ct->ct_general.use);
750 set_bit(IPS_CONFIRMED_BIT, &ct->status);
751 NF_CT_STAT_INC(insert);
752 write_unlock_bh(&nf_conntrack_lock);
dc808fe2
HW
753 help = nfct_help(ct);
754 if (help && help->helper)
9fb9cbb1
YK
755 nf_conntrack_event_cache(IPCT_HELPER, *pskb);
756#ifdef CONFIG_NF_NAT_NEEDED
757 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
758 test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
759 nf_conntrack_event_cache(IPCT_NATINFO, *pskb);
760#endif
761 nf_conntrack_event_cache(master_ct(ct) ?
762 IPCT_RELATED : IPCT_NEW, *pskb);
763 return NF_ACCEPT;
764 }
765
766 NF_CT_STAT_INC(insert_failed);
767 write_unlock_bh(&nf_conntrack_lock);
768 return NF_DROP;
769}
770
771/* Returns true if a connection corresponds to the tuple (required
772 for NAT). */
773int
774nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
775 const struct nf_conn *ignored_conntrack)
776{
777 struct nf_conntrack_tuple_hash *h;
778
779 read_lock_bh(&nf_conntrack_lock);
780 h = __nf_conntrack_find(tuple, ignored_conntrack);
781 read_unlock_bh(&nf_conntrack_lock);
782
783 return h != NULL;
784}
785
786/* There's a small race here where we may free a just-assured
787 connection. Too bad: we're in trouble anyway. */
788static inline int unreplied(const struct nf_conntrack_tuple_hash *i)
789{
790 return !(test_bit(IPS_ASSURED_BIT,
791 &nf_ct_tuplehash_to_ctrack(i)->status));
792}
793
794static int early_drop(struct list_head *chain)
795{
796 /* Traverse backwards: gives us oldest, which is roughly LRU */
797 struct nf_conntrack_tuple_hash *h;
798 struct nf_conn *ct = NULL;
799 int dropped = 0;
800
801 read_lock_bh(&nf_conntrack_lock);
802 h = LIST_FIND_B(chain, unreplied, struct nf_conntrack_tuple_hash *);
803 if (h) {
804 ct = nf_ct_tuplehash_to_ctrack(h);
805 atomic_inc(&ct->ct_general.use);
806 }
807 read_unlock_bh(&nf_conntrack_lock);
808
809 if (!ct)
810 return dropped;
811
812 if (del_timer(&ct->timeout)) {
813 death_by_timeout((unsigned long)ct);
814 dropped = 1;
815 NF_CT_STAT_INC(early_drop);
816 }
817 nf_ct_put(ct);
818 return dropped;
819}
820
821static inline int helper_cmp(const struct nf_conntrack_helper *i,
822 const struct nf_conntrack_tuple *rtuple)
823{
824 return nf_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
825}
826
827static struct nf_conntrack_helper *
c1d10adb 828__nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
9fb9cbb1
YK
829{
830 return LIST_FIND(&helpers, helper_cmp,
831 struct nf_conntrack_helper *,
832 tuple);
833}
834
c1d10adb
PNA
835struct nf_conntrack_helper *
836nf_ct_helper_find_get(const struct nf_conntrack_tuple *tuple)
837{
838 struct nf_conntrack_helper *helper;
839
840 /* need nf_conntrack_lock to assure that helper exists until
841 * try_module_get() is called */
842 read_lock_bh(&nf_conntrack_lock);
843
844 helper = __nf_ct_helper_find(tuple);
845 if (helper) {
846 /* need to increase module usage count to assure helper will
847 * not go away while the caller is e.g. busy putting a
848 * conntrack in the hash that uses the helper */
849 if (!try_module_get(helper->me))
850 helper = NULL;
851 }
852
853 read_unlock_bh(&nf_conntrack_lock);
854
855 return helper;
856}
857
858void nf_ct_helper_put(struct nf_conntrack_helper *helper)
859{
860 module_put(helper->me);
861}
862
9fb9cbb1
YK
863static struct nf_conn *
864__nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
865 const struct nf_conntrack_tuple *repl,
866 const struct nf_conntrack_l3proto *l3proto)
867{
868 struct nf_conn *conntrack = NULL;
869 u_int32_t features = 0;
dc808fe2 870 struct nf_conntrack_helper *helper;
9fb9cbb1 871
dc808fe2 872 if (unlikely(!nf_conntrack_hash_rnd_initted)) {
9fb9cbb1
YK
873 get_random_bytes(&nf_conntrack_hash_rnd, 4);
874 nf_conntrack_hash_rnd_initted = 1;
875 }
876
877 if (nf_conntrack_max
878 && atomic_read(&nf_conntrack_count) >= nf_conntrack_max) {
879 unsigned int hash = hash_conntrack(orig);
880 /* Try dropping from this hash chain. */
881 if (!early_drop(&nf_conntrack_hash[hash])) {
882 if (net_ratelimit())
883 printk(KERN_WARNING
884 "nf_conntrack: table full, dropping"
885 " packet.\n");
886 return ERR_PTR(-ENOMEM);
887 }
888 }
889
890 /* find features needed by this conntrack. */
891 features = l3proto->get_features(orig);
dc808fe2
HW
892
893 /* FIXME: protect helper list per RCU */
9fb9cbb1 894 read_lock_bh(&nf_conntrack_lock);
dc808fe2
HW
895 helper = __nf_ct_helper_find(repl);
896 if (helper)
9fb9cbb1
YK
897 features |= NF_CT_F_HELP;
898 read_unlock_bh(&nf_conntrack_lock);
899
900 DEBUGP("nf_conntrack_alloc: features=0x%x\n", features);
901
902 read_lock_bh(&nf_ct_cache_lock);
903
dc808fe2 904 if (unlikely(!nf_ct_cache[features].use)) {
9fb9cbb1
YK
905 DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n",
906 features);
907 goto out;
908 }
909
910 conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC);
911 if (conntrack == NULL) {
912 DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n");
913 goto out;
914 }
915
916 memset(conntrack, 0, nf_ct_cache[features].size);
917 conntrack->features = features;
dc808fe2
HW
918 if (helper) {
919 struct nf_conn_help *help = nfct_help(conntrack);
920 NF_CT_ASSERT(help);
921 help->helper = helper;
9fb9cbb1
YK
922 }
923
924 atomic_set(&conntrack->ct_general.use, 1);
925 conntrack->ct_general.destroy = destroy_conntrack;
926 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
927 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
928 /* Don't set timer yet: wait for confirmation */
929 init_timer(&conntrack->timeout);
930 conntrack->timeout.data = (unsigned long)conntrack;
931 conntrack->timeout.function = death_by_timeout;
932
933 atomic_inc(&nf_conntrack_count);
934out:
935 read_unlock_bh(&nf_ct_cache_lock);
936 return conntrack;
937}
938
939struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
940 const struct nf_conntrack_tuple *repl)
941{
942 struct nf_conntrack_l3proto *l3proto;
943
c1d10adb 944 l3proto = __nf_ct_l3proto_find(orig->src.l3num);
9fb9cbb1
YK
945 return __nf_conntrack_alloc(orig, repl, l3proto);
946}
947
948void nf_conntrack_free(struct nf_conn *conntrack)
949{
950 u_int32_t features = conntrack->features;
951 NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM);
952 DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features,
953 conntrack);
954 kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
955 atomic_dec(&nf_conntrack_count);
956}
957
958/* Allocate a new conntrack: we return -ENOMEM if classification
959 failed due to stress. Otherwise it really is unclassifiable. */
960static struct nf_conntrack_tuple_hash *
961init_conntrack(const struct nf_conntrack_tuple *tuple,
962 struct nf_conntrack_l3proto *l3proto,
963 struct nf_conntrack_protocol *protocol,
964 struct sk_buff *skb,
965 unsigned int dataoff)
966{
967 struct nf_conn *conntrack;
968 struct nf_conntrack_tuple repl_tuple;
969 struct nf_conntrack_expect *exp;
970
971 if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, protocol)) {
972 DEBUGP("Can't invert tuple.\n");
973 return NULL;
974 }
975
976 conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto);
977 if (conntrack == NULL || IS_ERR(conntrack)) {
978 DEBUGP("Can't allocate conntrack.\n");
979 return (struct nf_conntrack_tuple_hash *)conntrack;
980 }
981
982 if (!protocol->new(conntrack, skb, dataoff)) {
983 nf_conntrack_free(conntrack);
984 DEBUGP("init conntrack: can't track with proto module\n");
985 return NULL;
986 }
987
988 write_lock_bh(&nf_conntrack_lock);
989 exp = find_expectation(tuple);
990
991 if (exp) {
992 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
993 conntrack, exp);
994 /* Welcome, Mr. Bond. We've been expecting you... */
995 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
996 conntrack->master = exp->master;
997#ifdef CONFIG_NF_CONNTRACK_MARK
998 conntrack->mark = exp->master->mark;
999#endif
1000 nf_conntrack_get(&conntrack->master->ct_general);
1001 NF_CT_STAT_INC(expect_new);
dc808fe2 1002 } else
9fb9cbb1 1003 NF_CT_STAT_INC(new);
9fb9cbb1
YK
1004
1005 /* Overload tuple linked list to put us in unconfirmed list. */
1006 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
1007
1008 write_unlock_bh(&nf_conntrack_lock);
1009
1010 if (exp) {
1011 if (exp->expectfn)
1012 exp->expectfn(conntrack, exp);
1013 nf_conntrack_expect_put(exp);
1014 }
1015
1016 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
1017}
1018
1019/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
1020static inline struct nf_conn *
1021resolve_normal_ct(struct sk_buff *skb,
1022 unsigned int dataoff,
1023 u_int16_t l3num,
1024 u_int8_t protonum,
1025 struct nf_conntrack_l3proto *l3proto,
1026 struct nf_conntrack_protocol *proto,
1027 int *set_reply,
1028 enum ip_conntrack_info *ctinfo)
1029{
1030 struct nf_conntrack_tuple tuple;
1031 struct nf_conntrack_tuple_hash *h;
1032 struct nf_conn *ct;
1033
1034 if (!nf_ct_get_tuple(skb, (unsigned int)(skb->nh.raw - skb->data),
1035 dataoff, l3num, protonum, &tuple, l3proto,
1036 proto)) {
1037 DEBUGP("resolve_normal_ct: Can't get tuple\n");
1038 return NULL;
1039 }
1040
1041 /* look for tuple match */
1042 h = nf_conntrack_find_get(&tuple, NULL);
1043 if (!h) {
1044 h = init_conntrack(&tuple, l3proto, proto, skb, dataoff);
1045 if (!h)
1046 return NULL;
1047 if (IS_ERR(h))
1048 return (void *)h;
1049 }
1050 ct = nf_ct_tuplehash_to_ctrack(h);
1051
1052 /* It exists; we have (non-exclusive) reference. */
1053 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
1054 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
1055 /* Please set reply bit if this packet OK */
1056 *set_reply = 1;
1057 } else {
1058 /* Once we've had two way comms, always ESTABLISHED. */
1059 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1060 DEBUGP("nf_conntrack_in: normal packet for %p\n", ct);
1061 *ctinfo = IP_CT_ESTABLISHED;
1062 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
1063 DEBUGP("nf_conntrack_in: related packet for %p\n", ct);
1064 *ctinfo = IP_CT_RELATED;
1065 } else {
1066 DEBUGP("nf_conntrack_in: new packet for %p\n", ct);
1067 *ctinfo = IP_CT_NEW;
1068 }
1069 *set_reply = 0;
1070 }
1071 skb->nfct = &ct->ct_general;
1072 skb->nfctinfo = *ctinfo;
1073 return ct;
1074}
1075
1076unsigned int
1077nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb)
1078{
1079 struct nf_conn *ct;
1080 enum ip_conntrack_info ctinfo;
1081 struct nf_conntrack_l3proto *l3proto;
1082 struct nf_conntrack_protocol *proto;
1083 unsigned int dataoff;
1084 u_int8_t protonum;
1085 int set_reply = 0;
1086 int ret;
1087
1088 /* Previously seen (loopback or untracked)? Ignore. */
1089 if ((*pskb)->nfct) {
1090 NF_CT_STAT_INC(ignore);
1091 return NF_ACCEPT;
1092 }
1093
c1d10adb 1094 l3proto = __nf_ct_l3proto_find((u_int16_t)pf);
9fb9cbb1
YK
1095 if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) {
1096 DEBUGP("not prepared to track yet or error occurred\n");
1097 return -ret;
1098 }
1099
c1d10adb 1100 proto = __nf_ct_proto_find((u_int16_t)pf, protonum);
9fb9cbb1
YK
1101
1102 /* It may be a special packet, error, unclean...
1103 * the inverse of the return code tells the netfilter
1104 * core what to do with the packet. */
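	/* Sign convention (as used by the existing protocol trackers, noted
	 * here for illustration): error() returns a positive verdict to keep
	 * tracking, or minus a verdict to stop; e.g. a harmless but
	 * untrackable packet makes error() return -NF_ACCEPT, and the
	 * "-ret" below hands NF_ACCEPT back to the netfilter core. */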
1105 if (proto->error != NULL &&
1106 (ret = proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) {
1107 NF_CT_STAT_INC(error);
1108 NF_CT_STAT_INC(invalid);
1109 return -ret;
1110 }
1111
1112 ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, proto,
1113 &set_reply, &ctinfo);
1114 if (!ct) {
1115 /* Not valid part of a connection */
1116 NF_CT_STAT_INC(invalid);
1117 return NF_ACCEPT;
1118 }
1119
1120 if (IS_ERR(ct)) {
1121 /* Too stressed to deal. */
1122 NF_CT_STAT_INC(drop);
1123 return NF_DROP;
1124 }
1125
1126 NF_CT_ASSERT((*pskb)->nfct);
1127
1128 ret = proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum);
1129 if (ret < 0) {
1130 /* Invalid: inverse of the return code tells
1131 * the netfilter core what to do */
1132 DEBUGP("nf_conntrack_in: Can't track with proto module\n");
1133 nf_conntrack_put((*pskb)->nfct);
1134 (*pskb)->nfct = NULL;
1135 NF_CT_STAT_INC(invalid);
1136 return -ret;
1137 }
1138
1139 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
1140 nf_conntrack_event_cache(IPCT_STATUS, *pskb);
1141
1142 return ret;
1143}
1144
1145int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1146 const struct nf_conntrack_tuple *orig)
1147{
1148 return nf_ct_invert_tuple(inverse, orig,
c1d10adb
PNA
1149 __nf_ct_l3proto_find(orig->src.l3num),
1150 __nf_ct_proto_find(orig->src.l3num,
1151 orig->dst.protonum));
9fb9cbb1
YK
1152}
1153
1154/* Would two expected things clash? */
1155static inline int expect_clash(const struct nf_conntrack_expect *a,
1156 const struct nf_conntrack_expect *b)
1157{
1158 /* Part covered by intersection of masks must be unequal,
1159 otherwise they clash */
1160 struct nf_conntrack_tuple intersect_mask;
1161 int count;
1162
1163 intersect_mask.src.l3num = a->mask.src.l3num & b->mask.src.l3num;
1164 intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
1165 intersect_mask.dst.u.all = a->mask.dst.u.all & b->mask.dst.u.all;
1166 intersect_mask.dst.protonum = a->mask.dst.protonum
1167 & b->mask.dst.protonum;
1168
1169 for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1170 intersect_mask.src.u3.all[count] =
1171 a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
1172 }
1173
1174 for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1175 intersect_mask.dst.u3.all[count] =
1176 a->mask.dst.u3.all[count] & b->mask.dst.u3.all[count];
1177 }
1178
1179 return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
1180}
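/*
 * Illustration (hypothetical values): two FTP data expectations that both
 * wildcard the source port but pin source/destination address, destination
 * port and protocol clash when those pinned fields are identical, because
 * the two tuples compare equal under the intersection of the masks.
 */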
1181
1182static inline int expect_matches(const struct nf_conntrack_expect *a,
1183 const struct nf_conntrack_expect *b)
1184{
1185 return a->master == b->master
1186 && nf_ct_tuple_equal(&a->tuple, &b->tuple)
1187 && nf_ct_tuple_equal(&a->mask, &b->mask);
1188}
1189
1190/* Generally a bad idea to call this: could have matched already. */
1191void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp)
1192{
1193 struct nf_conntrack_expect *i;
1194
1195 write_lock_bh(&nf_conntrack_lock);
1196 /* choose the oldest expectation to evict */
1197 list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1198 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
1199 nf_ct_unlink_expect(i);
1200 write_unlock_bh(&nf_conntrack_lock);
1201 nf_conntrack_expect_put(i);
1202 return;
1203 }
1204 }
1205 write_unlock_bh(&nf_conntrack_lock);
1206}
1207
1208/* We don't increase the master conntrack refcount for non-fulfilled
1209 * expectations. During the conntrack destruction, the expectations are
1210 * always killed before the conntrack itself */
1211struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me)
1212{
1213 struct nf_conntrack_expect *new;
1214
1215 new = kmem_cache_alloc(nf_conntrack_expect_cachep, GFP_ATOMIC);
1216 if (!new) {
1217 DEBUGP("expect_related: OOM allocating expect\n");
1218 return NULL;
1219 }
1220 new->master = me;
1221 atomic_set(&new->use, 1);
1222 return new;
1223}
1224
1225void nf_conntrack_expect_put(struct nf_conntrack_expect *exp)
1226{
1227 if (atomic_dec_and_test(&exp->use))
1228 kmem_cache_free(nf_conntrack_expect_cachep, exp);
1229}
1230
1231static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp)
1232{
dc808fe2
HW
1233 struct nf_conn_help *master_help = nfct_help(exp->master);
1234
9fb9cbb1 1235 atomic_inc(&exp->use);
dc808fe2 1236 master_help->expecting++;
9fb9cbb1
YK
1237 list_add(&exp->list, &nf_conntrack_expect_list);
1238
1239 init_timer(&exp->timeout);
1240 exp->timeout.data = (unsigned long)exp;
1241 exp->timeout.function = expectation_timed_out;
dc808fe2 1242 exp->timeout.expires = jiffies + master_help->helper->timeout * HZ;
9fb9cbb1
YK
1243 add_timer(&exp->timeout);
1244
c1d10adb 1245 exp->id = ++nf_conntrack_expect_next_id;
9fb9cbb1
YK
1246 atomic_inc(&exp->use);
1247 NF_CT_STAT_INC(expect_create);
1248}
1249
1250/* Race with expectations being used means we could have none to find; OK. */
1251static void evict_oldest_expect(struct nf_conn *master)
1252{
1253 struct nf_conntrack_expect *i;
1254
1255 list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1256 if (i->master == master) {
1257 if (del_timer(&i->timeout)) {
1258 nf_ct_unlink_expect(i);
1259 nf_conntrack_expect_put(i);
1260 }
1261 break;
1262 }
1263 }
1264}
1265
1266static inline int refresh_timer(struct nf_conntrack_expect *i)
1267{
dc808fe2
HW
1268 struct nf_conn_help *master_help = nfct_help(i->master);
1269
9fb9cbb1
YK
1270 if (!del_timer(&i->timeout))
1271 return 0;
1272
dc808fe2 1273 i->timeout.expires = jiffies + master_help->helper->timeout*HZ;
9fb9cbb1
YK
1274 add_timer(&i->timeout);
1275 return 1;
1276}
1277
1278int nf_conntrack_expect_related(struct nf_conntrack_expect *expect)
1279{
1280 struct nf_conntrack_expect *i;
d695aa8a 1281 struct nf_conn *master = expect->master;
dc808fe2 1282 struct nf_conn_help *master_help = nfct_help(master);
9fb9cbb1
YK
1283 int ret;
1284
dc808fe2
HW
1285 NF_CT_ASSERT(master_help);
1286
9fb9cbb1
YK
1287 DEBUGP("nf_conntrack_expect_related %p\n", expect);
1288 DEBUGP("tuple: "); NF_CT_DUMP_TUPLE(&expect->tuple);
1289 DEBUGP("mask: "); NF_CT_DUMP_TUPLE(&expect->mask);
1290
1291 write_lock_bh(&nf_conntrack_lock);
1292 list_for_each_entry(i, &nf_conntrack_expect_list, list) {
1293 if (expect_matches(i, expect)) {
1294 /* Refresh timer: if it's dying, ignore.. */
1295 if (refresh_timer(i)) {
1296 ret = 0;
1297 goto out;
1298 }
1299 } else if (expect_clash(i, expect)) {
1300 ret = -EBUSY;
1301 goto out;
1302 }
1303 }
1304 /* Will be over limit? */
dc808fe2
HW
1305 if (master_help->helper->max_expected &&
1306 master_help->expecting >= master_help->helper->max_expected)
d695aa8a 1307 evict_oldest_expect(master);
9fb9cbb1
YK
1308
1309 nf_conntrack_expect_insert(expect);
1310 nf_conntrack_expect_event(IPEXP_NEW, expect);
1311 ret = 0;
1312out:
1313 write_unlock_bh(&nf_conntrack_lock);
1314 return ret;
1315}
1316
9fb9cbb1
YK
1317int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
1318{
1319 int ret;
1320 BUG_ON(me->timeout == 0);
1321
1322 ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
1323 sizeof(struct nf_conn)
dc808fe2
HW
1324 + sizeof(struct nf_conn_help)
1325 + __alignof__(struct nf_conn_help));
9fb9cbb1
YK
1326 if (ret < 0) {
1327 printk(KERN_ERR "nf_conntrack_helper_register: Unable to create slab cache for conntracks\n");
1328 return ret;
1329 }
1330 write_lock_bh(&nf_conntrack_lock);
1331 list_prepend(&helpers, me);
1332 write_unlock_bh(&nf_conntrack_lock);
1333
1334 return 0;
1335}
1336
c1d10adb
PNA
1337struct nf_conntrack_helper *
1338__nf_conntrack_helper_find_byname(const char *name)
1339{
1340 struct nf_conntrack_helper *h;
1341
1342 list_for_each_entry(h, &helpers, list) {
1343 if (!strcmp(h->name, name))
1344 return h;
1345 }
1346
1347 return NULL;
1348}
1349
9fb9cbb1
YK
1350static inline int unhelp(struct nf_conntrack_tuple_hash *i,
1351 const struct nf_conntrack_helper *me)
1352{
dc808fe2
HW
1353 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);
1354 struct nf_conn_help *help = nfct_help(ct);
1355
1356 if (help && help->helper == me) {
1357 nf_conntrack_event(IPCT_HELPER, ct);
1358 help->helper = NULL;
9fb9cbb1
YK
1359 }
1360 return 0;
1361}
1362
1363void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
1364{
1365 unsigned int i;
1366 struct nf_conntrack_expect *exp, *tmp;
1367
1368 /* Need write lock here, to delete helper. */
1369 write_lock_bh(&nf_conntrack_lock);
1370 LIST_DELETE(&helpers, me);
1371
1372 /* Get rid of expectations */
1373 list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) {
dc808fe2
HW
1374 struct nf_conn_help *help = nfct_help(exp->master);
1375 if (help->helper == me && del_timer(&exp->timeout)) {
9fb9cbb1
YK
1376 nf_ct_unlink_expect(exp);
1377 nf_conntrack_expect_put(exp);
1378 }
1379 }
1380
1381 /* Get rid of expecteds, set helpers to NULL. */
1382 LIST_FIND_W(&unconfirmed, unhelp, struct nf_conntrack_tuple_hash*, me);
1383 for (i = 0; i < nf_conntrack_htable_size; i++)
1384 LIST_FIND_W(&nf_conntrack_hash[i], unhelp,
1385 struct nf_conntrack_tuple_hash *, me);
1386 write_unlock_bh(&nf_conntrack_lock);
1387
1388 /* Someone could still be looking at the helper in a bh. */
1389 synchronize_net();
1390}
1391
1392/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1393void __nf_ct_refresh_acct(struct nf_conn *ct,
1394 enum ip_conntrack_info ctinfo,
1395 const struct sk_buff *skb,
1396 unsigned long extra_jiffies,
1397 int do_acct)
1398{
1399 int event = 0;
1400
1401 NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
1402 NF_CT_ASSERT(skb);
1403
1404 write_lock_bh(&nf_conntrack_lock);
1405
1406 /* If not in hash table, timer will not be active yet */
1407 if (!nf_ct_is_confirmed(ct)) {
1408 ct->timeout.expires = extra_jiffies;
1409 event = IPCT_REFRESH;
1410 } else {
1411 /* Need del_timer for race avoidance (may already be dying). */
1412 if (del_timer(&ct->timeout)) {
1413 ct->timeout.expires = jiffies + extra_jiffies;
1414 add_timer(&ct->timeout);
1415 event = IPCT_REFRESH;
1416 }
1417 }
1418
1419#ifdef CONFIG_NF_CT_ACCT
1420 if (do_acct) {
1421 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1422 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1423 skb->len - (unsigned int)(skb->nh.raw - skb->data);
1424 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1425 || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1426 event |= IPCT_COUNTER_FILLING;
1427 }
1428#endif
1429
1430 write_unlock_bh(&nf_conntrack_lock);
1431
1432 /* must be unlocked when calling event cache */
1433 if (event)
1434 nf_conntrack_event_cache(event, skb);
1435}
1436
c1d10adb
PNA
1437#if defined(CONFIG_NF_CT_NETLINK) || \
1438 defined(CONFIG_NF_CT_NETLINK_MODULE)
1439
1440#include <linux/netfilter/nfnetlink.h>
1441#include <linux/netfilter/nfnetlink_conntrack.h>
57b47a53
IM
1442#include <linux/mutex.h>
1443
c1d10adb
PNA
1444
1445/* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1446 * in nf_conntrack_core, since we don't want the protocols to autoload
1447 * or depend on ctnetlink */
1448int nf_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1449 const struct nf_conntrack_tuple *tuple)
1450{
1451 NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1452 &tuple->src.u.tcp.port);
1453 NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1454 &tuple->dst.u.tcp.port);
1455 return 0;
1456
1457nfattr_failure:
1458 return -1;
1459}
1460
1461static const size_t cta_min_proto[CTA_PROTO_MAX] = {
1462 [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t),
1463 [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t)
1464};
1465
1466int nf_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1467 struct nf_conntrack_tuple *t)
1468{
1469 if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1470 return -EINVAL;
1471
1472 if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
1473 return -EINVAL;
1474
1475 t->src.u.tcp.port =
1476 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1477 t->dst.u.tcp.port =
1478 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1479
1480 return 0;
1481}
1482#endif
1483
9fb9cbb1
YK
1484/* Used by ipt_REJECT and ip6t_REJECT. */
1485void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1486{
1487 struct nf_conn *ct;
1488 enum ip_conntrack_info ctinfo;
1489
1490 /* This ICMP is in reverse direction to the packet which caused it */
1491 ct = nf_ct_get(skb, &ctinfo);
1492 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1493 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1494 else
1495 ctinfo = IP_CT_RELATED;
1496
1497 /* Attach to new skbuff, and increment count */
1498 nskb->nfct = &ct->ct_general;
1499 nskb->nfctinfo = ctinfo;
1500 nf_conntrack_get(nskb->nfct);
1501}
1502
1503static inline int
1504do_iter(const struct nf_conntrack_tuple_hash *i,
1505 int (*iter)(struct nf_conn *i, void *data),
1506 void *data)
1507{
1508 return iter(nf_ct_tuplehash_to_ctrack(i), data);
1509}
1510
1511/* Bring out ya dead! */
1512static struct nf_conntrack_tuple_hash *
1513get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
1514 void *data, unsigned int *bucket)
1515{
1516 struct nf_conntrack_tuple_hash *h = NULL;
1517
1518 write_lock_bh(&nf_conntrack_lock);
1519 for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
1520 h = LIST_FIND_W(&nf_conntrack_hash[*bucket], do_iter,
1521 struct nf_conntrack_tuple_hash *, iter, data);
1522 if (h)
1523 break;
1524 }
1525 if (!h)
1526 h = LIST_FIND_W(&unconfirmed, do_iter,
1527 struct nf_conntrack_tuple_hash *, iter, data);
1528 if (h)
1529 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
1530 write_unlock_bh(&nf_conntrack_lock);
1531
1532 return h;
1533}
1534
1535void
1536nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data)
1537{
1538 struct nf_conntrack_tuple_hash *h;
1539 unsigned int bucket = 0;
1540
1541 while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1542 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
1543 /* Time to push up daisies... */
1544 if (del_timer(&ct->timeout))
1545 death_by_timeout((unsigned long)ct);
1546 /* ... else the timer will get him soon. */
1547
1548 nf_ct_put(ct);
1549 }
1550}
1551
1552static int kill_all(struct nf_conn *i, void *data)
1553{
1554 return 1;
1555}
1556
1557static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
1558{
1559 if (vmalloced)
1560 vfree(hash);
1561 else
1562 free_pages((unsigned long)hash,
1563 get_order(sizeof(struct list_head) * size));
1564}
1565
c1d10adb
PNA
1566void nf_conntrack_flush(void)
1567{
1568 nf_ct_iterate_cleanup(kill_all, NULL);
1569}
1570
9fb9cbb1
YK
1571/* Mishearing the voices in his head, our hero wonders how he's
1572 supposed to kill the mall. */
1573void nf_conntrack_cleanup(void)
1574{
1575 int i;
1576
7d3cdc6b
YK
1577 ip_ct_attach = NULL;
1578
9fb9cbb1
YK
1579 /* This makes sure all current packets have passed through
1580 netfilter framework. Roll on, two-stage module
1581 delete... */
1582 synchronize_net();
1583
1584 nf_ct_event_cache_flush();
1585 i_see_dead_people:
c1d10adb 1586 nf_conntrack_flush();
9fb9cbb1
YK
1587 if (atomic_read(&nf_conntrack_count) != 0) {
1588 schedule();
1589 goto i_see_dead_people;
1590 }
6636568c
PM
1591 /* wait until all references to nf_conntrack_untracked are dropped */
1592 while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
1593 schedule();
9fb9cbb1
YK
1594
1595 for (i = 0; i < NF_CT_F_NUM; i++) {
1596 if (nf_ct_cache[i].use == 0)
1597 continue;
1598
1599 NF_CT_ASSERT(nf_ct_cache[i].use == 1);
1600 nf_ct_cache[i].use = 1;
1601 nf_conntrack_unregister_cache(i);
1602 }
1603 kmem_cache_destroy(nf_conntrack_expect_cachep);
1604 free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1605 nf_conntrack_htable_size);
5a6f294e
KK
1606
1607 /* free l3proto protocol tables */
1608 for (i = 0; i < PF_MAX; i++)
1609 if (nf_ct_protos[i]) {
1610 kfree(nf_ct_protos[i]);
1611 nf_ct_protos[i] = NULL;
1612 }
9fb9cbb1
YK
1613}
1614
1615static struct list_head *alloc_hashtable(int size, int *vmalloced)
1616{
1617 struct list_head *hash;
1618 unsigned int i;
1619
1620 *vmalloced = 0;
1621 hash = (void*)__get_free_pages(GFP_KERNEL,
1622 get_order(sizeof(struct list_head)
1623 * size));
1624 if (!hash) {
1625 *vmalloced = 1;
1626 printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
1627 hash = vmalloc(sizeof(struct list_head) * size);
1628 }
1629
1630 if (hash)
1631 for (i = 0; i < size; i++)
1632 INIT_LIST_HEAD(&hash[i]);
1633
1634 return hash;
1635}
1636
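/*
 * The hash table can be resized at runtime through the module parameter
 * registered below, e.g. (assuming the usual module name, nf_conntrack):
 *
 *	echo 16384 > /sys/module/nf_conntrack/parameters/hashsize
 *
 * which ends up in set_hashsize() with val = "16384".
 */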
1637int set_hashsize(const char *val, struct kernel_param *kp)
1638{
1639 int i, bucket, hashsize, vmalloced;
1640 int old_vmalloced, old_size;
1641 int rnd;
1642 struct list_head *hash, *old_hash;
1643 struct nf_conntrack_tuple_hash *h;
1644
1645 /* On boot, we can set this without any fancy locking. */
1646 if (!nf_conntrack_htable_size)
1647 return param_set_uint(val, kp);
1648
1649 hashsize = simple_strtol(val, NULL, 0);
1650 if (!hashsize)
1651 return -EINVAL;
1652
1653 hash = alloc_hashtable(hashsize, &vmalloced);
1654 if (!hash)
1655 return -ENOMEM;
1656
1657 /* We have to rehash for the new table anyway, so we can also
1658 * use a new random seed */
1659 get_random_bytes(&rnd, 4);
1660
1661 write_lock_bh(&nf_conntrack_lock);
1662 for (i = 0; i < nf_conntrack_htable_size; i++) {
1663 while (!list_empty(&nf_conntrack_hash[i])) {
1664 h = list_entry(nf_conntrack_hash[i].next,
1665 struct nf_conntrack_tuple_hash, list);
1666 list_del(&h->list);
1667 bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1668 list_add_tail(&h->list, &hash[bucket]);
1669 }
1670 }
1671 old_size = nf_conntrack_htable_size;
1672 old_vmalloced = nf_conntrack_vmalloc;
1673 old_hash = nf_conntrack_hash;
1674
1675 nf_conntrack_htable_size = hashsize;
1676 nf_conntrack_vmalloc = vmalloced;
1677 nf_conntrack_hash = hash;
1678 nf_conntrack_hash_rnd = rnd;
1679 write_unlock_bh(&nf_conntrack_lock);
1680
1681 free_conntrack_hash(old_hash, old_vmalloced, old_size);
1682 return 0;
1683}
1684
1685module_param_call(hashsize, set_hashsize, param_get_uint,
1686 &nf_conntrack_htable_size, 0600);
1687
1688int __init nf_conntrack_init(void)
1689{
1690 unsigned int i;
1691 int ret;
1692
1693 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1694 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
1695 if (!nf_conntrack_htable_size) {
1696 nf_conntrack_htable_size
1697 = (((num_physpages << PAGE_SHIFT) / 16384)
1698 / sizeof(struct list_head));
1699 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1700 nf_conntrack_htable_size = 8192;
1701 if (nf_conntrack_htable_size < 16)
1702 nf_conntrack_htable_size = 16;
1703 }
1704 nf_conntrack_max = 8 * nf_conntrack_htable_size;
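	/* Worked example of the sizing above (illustration only, assuming an
	 * 8-byte struct list_head on a 32-bit box): a 512 MB machine gets
	 * 512 MB / 16384 = 32 KB of table, i.e. 4096 buckets, and
	 * nf_conntrack_max = 8 * 4096 = 32768 entries. */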
1705
1706 printk("nf_conntrack version %s (%u buckets, %d max)\n",
1707 NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
1708 nf_conntrack_max);
1709
1710 nf_conntrack_hash = alloc_hashtable(nf_conntrack_htable_size,
1711 &nf_conntrack_vmalloc);
1712 if (!nf_conntrack_hash) {
1713 printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
1714 goto err_out;
1715 }
1716
1717 ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic",
dc808fe2 1718 sizeof(struct nf_conn));
9fb9cbb1
YK
1719 if (ret < 0) {
1720 printk(KERN_ERR "Unable to create nf_conn slab cache\n");
1721 goto err_free_hash;
1722 }
1723
1724 nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect",
1725 sizeof(struct nf_conntrack_expect),
1726 0, 0, NULL, NULL);
1727 if (!nf_conntrack_expect_cachep) {
1728 printk(KERN_ERR "Unable to create nf_expect slab cache\n");
1729 goto err_free_conntrack_slab;
1730 }
1731
1732 /* Don't NEED lock here, but good form anyway. */
1733 write_lock_bh(&nf_conntrack_lock);
1734 for (i = 0; i < PF_MAX; i++)
1735 nf_ct_l3protos[i] = &nf_conntrack_generic_l3proto;
1736 write_unlock_bh(&nf_conntrack_lock);
1737
7d3cdc6b
YK
1738 /* For use by REJECT target */
1739 ip_ct_attach = __nf_conntrack_attach;
1740
9fb9cbb1
YK
1741 /* Set up fake conntrack:
1742 - to never be deleted, not in any hashes */
1743 atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
1744 /* - and make it look like a confirmed connection */
1745 set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
1746
1747 return ret;
1748
1749err_free_conntrack_slab:
1750 nf_conntrack_unregister_cache(NF_CT_F_BASIC);
1751err_free_hash:
1752 free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1753 nf_conntrack_htable_size);
1754err_out:
1755 return -ENOMEM;
1756}